Total coverage: 73945 (5%)of 1820798
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 /* SPDX-License-Identifier: GPL-2.0 */ /* taskstats_kern.h - kernel header for per-task statistics interface * * Copyright (C) Shailabh Nagar, IBM Corp. 2006 * (C) Balbir Singh, IBM Corp. 2006 */ #ifndef _LINUX_TASKSTATS_KERN_H #define _LINUX_TASKSTATS_KERN_H #include <linux/taskstats.h> #include <linux/sched/signal.h> #include <linux/slab.h> #ifdef CONFIG_TASKSTATS extern struct kmem_cache *taskstats_cache; extern struct mutex taskstats_exit_mutex; static inline void taskstats_tgid_free(struct signal_struct *sig) { if (sig->stats) kmem_cache_free(taskstats_cache, sig->stats); } extern void taskstats_exit(struct task_struct *, int group_dead); extern void taskstats_init_early(void); #else static inline void taskstats_exit(struct task_struct *tsk, int group_dead) {} static inline void taskstats_tgid_free(struct signal_struct *sig) {} static inline void taskstats_init_early(void) {} #endif /* CONFIG_TASKSTATS */ #endif
80 1 80 80 53 53 75 80 80 81 69 12 79 1 80 81 81 42 2 1 1 29 13 42 42 42 42 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/skbuff.h> #include <linux/sctp.h> #include <net/gso.h> #include <net/gro.h> /** * skb_eth_gso_segment - segmentation handler for ethernet protocols. * @skb: buffer to segment * @features: features for the output path (see dev->features) * @type: Ethernet Protocol ID */ struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, netdev_features_t features, __be16 type) { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_offload *ptype; rcu_read_lock(); list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; } } rcu_read_unlock(); return segs; } EXPORT_SYMBOL(skb_eth_gso_segment); /** * skb_mac_gso_segment - mac layer segmentation handler. * @skb: buffer to segment * @features: features for the output path (see dev->features) */ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_offload *ptype; int vlan_depth = skb->mac_len; __be16 type = skb_network_protocol(skb, &vlan_depth); if (unlikely(!type)) return ERR_PTR(-EINVAL); __skb_pull(skb, vlan_depth); rcu_read_lock(); list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; } } rcu_read_unlock(); __skb_push(skb, skb->data - skb_mac_header(skb)); return segs; } EXPORT_SYMBOL(skb_mac_gso_segment); /* openvswitch calls this on rx path, so we need a different check. */ static bool skb_needs_check(const struct sk_buff *skb, bool tx_path) { if (tx_path) return skb->ip_summed != CHECKSUM_PARTIAL && skb->ip_summed != CHECKSUM_UNNECESSARY; return skb->ip_summed == CHECKSUM_NONE; } /** * __skb_gso_segment - Perform segmentation on skb. * @skb: buffer to segment * @features: features for the output path (see dev->features) * @tx_path: whether it is called in TX path * * This function segments the given skb and returns a list of segments. * * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. * * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. */ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path) { struct sk_buff *segs; if (unlikely(skb_needs_check(skb, tx_path))) { int err; /* We're going to init ->check field in TCP or UDP header */ err = skb_cow_head(skb, 0); if (err < 0) return ERR_PTR(err); } /* Only report GSO partial support if it will enable us to * support segmentation on this frame without needing additional * work. */ if (features & NETIF_F_GSO_PARTIAL) { netdev_features_t partial_features = NETIF_F_GSO_ROBUST; struct net_device *dev = skb->dev; partial_features |= dev->features & dev->gso_partial_features; if (!skb_gso_ok(skb, features | partial_features)) features &= ~NETIF_F_GSO_PARTIAL; } BUILD_BUG_ON(SKB_GSO_CB_OFFSET + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); SKB_GSO_CB(skb)->encap_level = 0; skb_reset_mac_header(skb); skb_reset_mac_len(skb); segs = skb_mac_gso_segment(skb, features); if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) skb_warn_bad_offload(skb); return segs; } EXPORT_SYMBOL(__skb_gso_segment); /** * skb_gso_transport_seglen - Return length of individual segments of a gso packet * * @skb: GSO skb * * skb_gso_transport_seglen is used to determine the real size of the * individual segments, including Layer4 headers (TCP/UDP). * * The MAC/L2 or network (IP, IPv6) headers are not accounted for. */ static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); unsigned int thlen = 0; if (skb->encapsulation) { thlen = skb_inner_transport_header(skb) - skb_transport_header(skb); if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) thlen += inner_tcp_hdrlen(skb); } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { thlen = tcp_hdrlen(skb); } else if (unlikely(skb_is_gso_sctp(skb))) { thlen = sizeof(struct sctphdr); } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { thlen = sizeof(struct udphdr); } /* UFO sets gso_size to the size of the fragmentation * payload, i.e. the size of the L4 (UDP) header is already * accounted for. */ return thlen + shinfo->gso_size; } /** * skb_gso_network_seglen - Return length of individual segments of a gso packet * * @skb: GSO skb * * skb_gso_network_seglen is used to determine the real size of the * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). * * The MAC/L2 header is not accounted for. */ static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) { unsigned int hdr_len = skb_transport_header(skb) - skb_network_header(skb); return hdr_len + skb_gso_transport_seglen(skb); } /** * skb_gso_mac_seglen - Return length of individual segments of a gso packet * * @skb: GSO skb * * skb_gso_mac_seglen is used to determine the real size of the * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 * headers (TCP/UDP). */ static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) { unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); return hdr_len + skb_gso_transport_seglen(skb); } /** * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS * * There are a couple of instances where we have a GSO skb, and we * want to determine what size it would be after it is segmented. * * We might want to check: * - L3+L4+payload size (e.g. IP forwarding) * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) * * This is a helper to do that correctly considering GSO_BY_FRAGS. * * @skb: GSO skb * * @seg_len: The segmented length (from skb_gso_*_seglen). In the * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. * * @max_len: The maximum permissible length. * * Returns true if the segmented length <= max length. */ static inline bool skb_gso_size_check(const struct sk_buff *skb, unsigned int seg_len, unsigned int max_len) { const struct skb_shared_info *shinfo = skb_shinfo(skb); const struct sk_buff *iter; if (shinfo->gso_size != GSO_BY_FRAGS) return seg_len <= max_len; /* Undo this so we can re-use header sizes */ seg_len -= GSO_BY_FRAGS; skb_walk_frags(skb, iter) { if (seg_len + skb_headlen(iter) > max_len) return false; } return true; } /** * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? * * @skb: GSO skb * @mtu: MTU to validate against * * skb_gso_validate_network_len validates if a given skb will fit a * wanted MTU once split. It considers L3 headers, L4 headers, and the * payload. */ bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) { return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); } EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); /** * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? * * @skb: GSO skb * @len: length to validate against * * skb_gso_validate_mac_len validates if a given skb will fit a wanted * length once split, including L2, L3 and L4 headers and the payload. */ bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) { return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); } EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);
7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 /* SPDX-License-Identifier: GPL-2.0-only */ /* * File: pep.h * * Phonet Pipe End Point sockets definitions * * Copyright (C) 2008 Nokia Corporation. */ #ifndef NET_PHONET_PEP_H #define NET_PHONET_PEP_H #include <linux/skbuff.h> #include <net/phonet/phonet.h> struct pep_sock { struct pn_sock pn_sk; /* XXX: union-ify listening vs connected stuff ? */ /* Listening socket stuff: */ struct hlist_head hlist; /* Connected socket stuff: */ struct sock *listener; struct sk_buff_head ctrlreq_queue; #define PNPIPE_CTRLREQ_MAX 10 atomic_t tx_credits; int ifindex; u16 peer_type; /* peer type/subtype */ u8 pipe_handle; u8 rx_credits; u8 rx_fc; /* RX flow control */ u8 tx_fc; /* TX flow control */ u8 init_enable; /* auto-enable at creation */ u8 aligned; }; static inline struct pep_sock *pep_sk(struct sock *sk) { return (struct pep_sock *)sk; } extern const struct proto_ops phonet_stream_ops; /* Pipe protocol definitions */ struct pnpipehdr { u8 utid; /* transaction ID */ u8 message_id; u8 pipe_handle; union { u8 state_after_connect; /* connect request */ u8 state_after_reset; /* reset request */ u8 error_code; /* any response */ u8 pep_type; /* status indication */ u8 data0; /* anything else */ }; u8 data[]; }; #define other_pep_type data[0] static inline struct pnpipehdr *pnp_hdr(struct sk_buff *skb) { return (struct pnpipehdr *)skb_transport_header(skb); } #define MAX_PNPIPE_HEADER (MAX_PHONET_HEADER + 4) enum { PNS_PIPE_CREATE_REQ = 0x00, PNS_PIPE_CREATE_RESP, PNS_PIPE_REMOVE_REQ, PNS_PIPE_REMOVE_RESP, PNS_PIPE_DATA = 0x20, PNS_PIPE_ALIGNED_DATA, PNS_PEP_CONNECT_REQ = 0x40, PNS_PEP_CONNECT_RESP, PNS_PEP_DISCONNECT_REQ, PNS_PEP_DISCONNECT_RESP, PNS_PEP_RESET_REQ, PNS_PEP_RESET_RESP, PNS_PEP_ENABLE_REQ, PNS_PEP_ENABLE_RESP, PNS_PEP_CTRL_REQ, PNS_PEP_CTRL_RESP, PNS_PEP_DISABLE_REQ = 0x4C, PNS_PEP_DISABLE_RESP, PNS_PEP_STATUS_IND = 0x60, PNS_PIPE_CREATED_IND, PNS_PIPE_RESET_IND = 0x63, PNS_PIPE_ENABLED_IND, PNS_PIPE_REDIRECTED_IND, PNS_PIPE_DISABLED_IND = 0x66, }; #define PN_PIPE_INVALID_HANDLE 0xff #define PN_PEP_TYPE_COMMON 0x00 /* Phonet pipe status indication */ enum { PN_PEP_IND_FLOW_CONTROL, PN_PEP_IND_ID_MCFC_GRANT_CREDITS, }; /* Phonet pipe error codes */ enum { PN_PIPE_NO_ERROR, PN_PIPE_ERR_INVALID_PARAM, PN_PIPE_ERR_INVALID_HANDLE, PN_PIPE_ERR_INVALID_CTRL_ID, PN_PIPE_ERR_NOT_ALLOWED, PN_PIPE_ERR_PEP_IN_USE, PN_PIPE_ERR_OVERLOAD, PN_PIPE_ERR_DEV_DISCONNECTED, PN_PIPE_ERR_TIMEOUT, PN_PIPE_ERR_ALL_PIPES_IN_USE, PN_PIPE_ERR_GENERAL, PN_PIPE_ERR_NOT_SUPPORTED, }; /* Phonet pipe states */ enum { PN_PIPE_DISABLE, PN_PIPE_ENABLE, }; /* Phonet pipe sub-block types */ enum { PN_PIPE_SB_CREATE_REQ_PEP_SUB_TYPE, PN_PIPE_SB_CONNECT_REQ_PEP_SUB_TYPE, PN_PIPE_SB_REDIRECT_REQ_PEP_SUB_TYPE, PN_PIPE_SB_NEGOTIATED_FC, PN_PIPE_SB_REQUIRED_FC_TX, PN_PIPE_SB_PREFERRED_FC_RX, PN_PIPE_SB_ALIGNED_DATA, }; /* Phonet pipe flow control models */ enum { PN_NO_FLOW_CONTROL, PN_LEGACY_FLOW_CONTROL, PN_ONE_CREDIT_FLOW_CONTROL, PN_MULTI_CREDIT_FLOW_CONTROL, PN_MAX_FLOW_CONTROL, }; #define pn_flow_safe(fc) ((fc) >> 1) /* Phonet pipe flow control states */ enum { PEP_IND_EMPTY, PEP_IND_BUSY, PEP_IND_READY, }; #endif
7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 /* SPDX-License-Identifier: GPL-2.0 */ /* * Runtime locking correctness validator * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * see Documentation/locking/lockdep-design.rst for more details. */ #ifndef __LINUX_LOCKDEP_H #define __LINUX_LOCKDEP_H #include <linux/lockdep_types.h> #include <linux/smp.h> #include <asm/percpu.h> struct task_struct; #ifdef CONFIG_LOCKDEP #include <linux/linkage.h> #include <linux/list.h> #include <linux/debug_locks.h> #include <linux/stacktrace.h> static inline void lockdep_copy_map(struct lockdep_map *to, struct lockdep_map *from) { int i; *to = *from; /* * Since the class cache can be modified concurrently we could observe * half pointers (64bit arch using 32bit copy insns). Therefore clear * the caches and take the performance hit. * * XXX it doesn't work well with lockdep_set_class_and_subclass(), since * that relies on cache abuse. */ for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) to->class_cache[i] = NULL; } /* * Every lock has a list of other locks that were taken after it. * We only grow the list, never remove from it: */ struct lock_list { struct list_head entry; struct lock_class *class; struct lock_class *links_to; const struct lock_trace *trace; u16 distance; /* bitmap of different dependencies from head to this */ u8 dep; /* used by BFS to record whether "prev -> this" only has -(*R)-> */ u8 only_xr; /* * The parent field is used to implement breadth-first search, and the * bit 0 is reused to indicate if the lock has been accessed in BFS. */ struct lock_list *parent; }; /** * struct lock_chain - lock dependency chain record * * @irq_context: the same as irq_context in held_lock below * @depth: the number of held locks in this chain * @base: the index in chain_hlocks for this chain * @entry: the collided lock chains in lock_chain hash list * @chain_key: the hash key of this lock_chain */ struct lock_chain { /* see BUILD_BUG_ON()s in add_chain_cache() */ unsigned int irq_context : 2, depth : 6, base : 24; /* 4 byte hole */ struct hlist_node entry; u64 chain_key; }; /* * Initialization, self-test and debugging-output methods: */ extern void lockdep_init(void); extern void lockdep_reset(void); extern void lockdep_reset_lock(struct lockdep_map *lock); extern void lockdep_free_key_range(void *start, unsigned long size); extern asmlinkage void lockdep_sys_exit(void); extern void lockdep_set_selftest_task(struct task_struct *task); extern void lockdep_init_task(struct task_struct *task); /* * Split the recursion counter in two to readily detect 'off' vs recursion. */ #define LOCKDEP_RECURSION_BITS 16 #define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) #define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) /* * lockdep_{off,on}() are macros to avoid tracing and kprobes; not inlines due * to header dependencies. */ #define lockdep_off() \ do { \ current->lockdep_recursion += LOCKDEP_OFF; \ } while (0) #define lockdep_on() \ do { \ current->lockdep_recursion -= LOCKDEP_OFF; \ } while (0) extern void lockdep_register_key(struct lock_class_key *key); extern void lockdep_unregister_key(struct lock_class_key *key); /* * These methods are used by specific locking variants (spinlocks, * rwlocks, mutexes and rwsems) to pass init/acquire/release events * to lockdep: */ extern void lockdep_init_map_type(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner, u8 outer, u8 lock_type); static inline void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner, u8 outer) { lockdep_init_map_type(lock, name, key, subclass, inner, outer, LD_LOCK_NORMAL); } static inline void lockdep_init_map_wait(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner) { lockdep_init_map_waits(lock, name, key, subclass, inner, LD_WAIT_INV); } static inline void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { lockdep_init_map_wait(lock, name, key, subclass, LD_WAIT_INV); } /* * Reinitialize a lock key - for cases where there is special locking or * special initialization of locks so that the validator gets the scope * of dependencies wrong: they are either too broad (they need a class-split) * or they are too narrow (they suffer from a false class-split): */ #define lockdep_set_class(lock, key) \ lockdep_init_map_type(&(lock)->dep_map, #key, key, 0, \ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) #define lockdep_set_class_and_name(lock, key, name) \ lockdep_init_map_type(&(lock)->dep_map, name, key, 0, \ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) #define lockdep_set_class_and_subclass(lock, key, sub) \ lockdep_init_map_type(&(lock)->dep_map, #key, key, sub, \ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) #define lockdep_set_subclass(lock, sub) \ lockdep_init_map_type(&(lock)->dep_map, (lock)->dep_map.name, (lock)->dep_map.key, sub,\ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) /** * lockdep_set_novalidate_class: disable checking of lock ordering on a given * lock * @lock: Lock to mark * * Lockdep will still record that this lock has been taken, and print held * instances when dumping locks */ #define lockdep_set_novalidate_class(lock) \ lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock) /** * lockdep_set_notrack_class: disable lockdep tracking of a given lock entirely * @lock: Lock to mark * * Bigger hammer than lockdep_set_novalidate_class: so far just for bcachefs, * which takes more locks than lockdep is able to track (48). */ #define lockdep_set_notrack_class(lock) \ lockdep_set_class_and_name(lock, &__lockdep_no_track__, #lock) /* * Compare locking classes */ #define lockdep_match_class(lock, key) lockdep_match_key(&(lock)->dep_map, key) static inline int lockdep_match_key(struct lockdep_map *lock, struct lock_class_key *key) { return lock->key == key; } /* * Acquire a lock. * * Values for "read": * * 0: exclusive (write) acquire * 1: read-acquire (no recursion allowed) * 2: read-acquire with same-instance recursion allowed * * Values for check: * * 0: simple checks (freeing, held-at-exit-time, etc.) * 1: full validation */ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *nest_lock, unsigned long ip); extern void lock_release(struct lockdep_map *lock, unsigned long ip); extern void lock_sync(struct lockdep_map *lock, unsigned int subclass, int read, int check, struct lockdep_map *nest_lock, unsigned long ip); /* lock_is_held_type() returns */ #define LOCK_STATE_UNKNOWN -1 #define LOCK_STATE_NOT_HELD 0 #define LOCK_STATE_HELD 1 /* * Same "read" as for lock_acquire(), except -1 means any. */ extern int lock_is_held_type(const struct lockdep_map *lock, int read); static inline int lock_is_held(const struct lockdep_map *lock) { return lock_is_held_type(lock, -1); } #define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) #define lockdep_is_held_type(lock, r) lock_is_held_type(&(lock)->dep_map, (r)) extern void lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, unsigned long ip); #define lock_set_novalidate_class(l, n, i) \ lock_set_class(l, n, &__lockdep_no_validate__, 0, i) static inline void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass, unsigned long ip) { lock_set_class(lock, lock->name, lock->key, subclass, ip); } extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); #define NIL_COOKIE (struct pin_cookie){ .val = 0U, } extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock); extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie); extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) #define lockdep_assert(cond) \ do { WARN_ON(debug_locks && !(cond)); } while (0) #define lockdep_assert_once(cond) \ do { WARN_ON_ONCE(debug_locks && !(cond)); } while (0) #define lockdep_assert_held(l) \ lockdep_assert(lockdep_is_held(l) != LOCK_STATE_NOT_HELD) #define lockdep_assert_not_held(l) \ lockdep_assert(lockdep_is_held(l) != LOCK_STATE_HELD) #define lockdep_assert_held_write(l) \ lockdep_assert(lockdep_is_held_type(l, 0)) #define lockdep_assert_held_read(l) \ lockdep_assert(lockdep_is_held_type(l, 1)) #define lockdep_assert_held_once(l) \ lockdep_assert_once(lockdep_is_held(l) != LOCK_STATE_NOT_HELD) #define lockdep_assert_none_held_once() \ lockdep_assert_once(!current->lockdep_depth) #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) #define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map) #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) /* * Must use lock_map_aquire_try() with override maps to avoid * lockdep thinking they participate in the block chain. */ #define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ struct lockdep_map _name = { \ .name = #_name "-wait-type-override", \ .wait_type_inner = _wait_type, \ .lock_type = LD_LOCK_WAIT_OVERRIDE, } #else /* !CONFIG_LOCKDEP */ static inline void lockdep_init_task(struct task_struct *task) { } static inline void lockdep_off(void) { } static inline void lockdep_on(void) { } static inline void lockdep_set_selftest_task(struct task_struct *task) { } # define lock_acquire(l, s, t, r, c, n, i) do { } while (0) # define lock_release(l, i) do { } while (0) # define lock_downgrade(l, i) do { } while (0) # define lock_set_class(l, n, key, s, i) do { (void)(key); } while (0) # define lock_set_novalidate_class(l, n, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_init_map_type(lock, name, key, sub, inner, outer, type) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map_waits(lock, name, key, sub, inner, outer) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map_wait(lock, name, key, sub, inner) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) # define lockdep_set_class(lock, key) do { (void)(key); } while (0) # define lockdep_set_class_and_name(lock, key, name) \ do { (void)(key); (void)(name); } while (0) #define lockdep_set_class_and_subclass(lock, key, sub) \ do { (void)(key); } while (0) #define lockdep_set_subclass(lock, sub) do { } while (0) #define lockdep_set_novalidate_class(lock) do { } while (0) #define lockdep_set_notrack_class(lock) do { } while (0) /* * We don't define lockdep_match_class() and lockdep_match_key() for !LOCKDEP * case since the result is not well defined and the caller should rather * #ifdef the call himself. */ # define lockdep_reset() do { debug_locks = 1; } while (0) # define lockdep_free_key_range(start, size) do { } while (0) # define lockdep_sys_exit() do { } while (0) static inline void lockdep_register_key(struct lock_class_key *key) { } static inline void lockdep_unregister_key(struct lock_class_key *key) { } #define lockdep_depth(tsk) (0) /* * Dummy forward declarations, allow users to write less ifdef-y code * and depend on dead code elimination. */ extern int lock_is_held(const void *); extern int lockdep_is_held(const void *); #define lockdep_is_held_type(l, r) (1) #define lockdep_assert(c) do { } while (0) #define lockdep_assert_once(c) do { } while (0) #define lockdep_assert_held(l) do { (void)(l); } while (0) #define lockdep_assert_not_held(l) do { (void)(l); } while (0) #define lockdep_assert_held_write(l) do { (void)(l); } while (0) #define lockdep_assert_held_read(l) do { (void)(l); } while (0) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) #define lockdep_assert_none_held_once() do { } while (0) #define lockdep_recursing(tsk) (0) #define NIL_COOKIE (struct pin_cookie){ } #define lockdep_pin_lock(l) ({ struct pin_cookie cookie = { }; cookie; }) #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ struct lockdep_map __maybe_unused _name = {} #endif /* !LOCKDEP */ #ifdef CONFIG_PROVE_LOCKING void lockdep_set_lock_cmp_fn(struct lockdep_map *, lock_cmp_fn, lock_print_fn); #define lock_set_cmp_fn(lock, ...) lockdep_set_lock_cmp_fn(&(lock)->dep_map, __VA_ARGS__) #else #define lock_set_cmp_fn(lock, ...) do { } while (0) #endif enum xhlock_context_t { XHLOCK_HARD, XHLOCK_SOFT, XHLOCK_CTX_NR, }; /* * To initialize a lockdep_map statically use this macro. * Note that _name must not be NULL. */ #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), } static inline void lockdep_invariant_state(bool force) {} static inline void lockdep_free_task(struct task_struct *task) {} #ifdef CONFIG_LOCK_STAT extern void lock_contended(struct lockdep_map *lock, unsigned long ip); extern void lock_acquired(struct lockdep_map *lock, unsigned long ip); #define LOCK_CONTENDED(_lock, try, lock) \ do { \ if (!try(_lock)) { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ lock(_lock); \ } \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ } while (0) #define LOCK_CONTENDED_RETURN(_lock, try, lock) \ ({ \ int ____err = 0; \ if (!try(_lock)) { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ ____err = lock(_lock); \ } \ if (!____err) \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ ____err; \ }) #else /* CONFIG_LOCK_STAT */ #define lock_contended(lockdep_map, ip) do {} while (0) #define lock_acquired(lockdep_map, ip) do {} while (0) #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) #define LOCK_CONTENDED_RETURN(_lock, try, lock) \ lock(_lock) #endif /* CONFIG_LOCK_STAT */ #ifdef CONFIG_PROVE_LOCKING extern void print_irqtrace_events(struct task_struct *curr); #else static inline void print_irqtrace_events(struct task_struct *curr) { } #endif /* Variable used to make lockdep treat read_lock() as recursive in selftests */ #ifdef CONFIG_DEBUG_LOCKING_API_SELFTESTS extern unsigned int force_read_lock_recursive; #else /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */ #define force_read_lock_recursive 0 #endif /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */ #ifdef CONFIG_LOCKDEP extern bool read_lock_is_recursive(void); #else /* CONFIG_LOCKDEP */ /* If !LOCKDEP, the value is meaningless */ #define read_lock_is_recursive() 0 #endif /* * For trivial one-depth nesting of a lock-class, the following * global define can be used. (Subsystems with multiple levels * of nesting should define their own lock-nesting subclasses.) */ #define SINGLE_DEPTH_NESTING 1 /* * Map the dependency ops to NOP or to real lockdep ops, depending * on the per lock-class debug mode: */ #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) #define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define spin_release(l, i) lock_release(l, i) #define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define rwlock_acquire_read(l, s, t, i) \ do { \ if (read_lock_is_recursive()) \ lock_acquire_shared_recursive(l, s, t, NULL, i); \ else \ lock_acquire_shared(l, s, t, NULL, i); \ } while (0) #define rwlock_release(l, i) lock_release(l, i) #define seqcount_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define seqcount_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) #define seqcount_release(l, i) lock_release(l, i) #define mutex_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define mutex_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define mutex_release(l, i) lock_release(l, i) #define rwsem_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define rwsem_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define rwsem_acquire_read(l, s, t, i) lock_acquire_shared(l, s, t, NULL, i) #define rwsem_release(l, i) lock_release(l, i) #define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_try(l) lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_tryread(l) lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_release(l) lock_release(l, _THIS_IP_) #define lock_map_sync(l) lock_sync(l, 0, 0, 1, NULL, _THIS_IP_) #ifdef CONFIG_PROVE_LOCKING # define might_lock(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) # define might_lock_read(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) # define might_lock_nested(lock, subclass) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, subclass, 0, 1, 1, NULL, \ _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) DECLARE_PER_CPU(int, hardirqs_enabled); DECLARE_PER_CPU(int, hardirq_context); DECLARE_PER_CPU(unsigned int, lockdep_recursion); #define __lockdep_enabled (debug_locks && !this_cpu_read(lockdep_recursion)) #define lockdep_assert_irqs_enabled() \ do { \ WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirqs_enabled)); \ } while (0) #define lockdep_assert_irqs_disabled() \ do { \ WARN_ON_ONCE(__lockdep_enabled && this_cpu_read(hardirqs_enabled)); \ } while (0) #define lockdep_assert_in_irq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirq_context)); \ } while (0) #define lockdep_assert_no_hardirq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && (this_cpu_read(hardirq_context) || \ !this_cpu_read(hardirqs_enabled))); \ } while (0) #define lockdep_assert_preemption_enabled() \ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ __lockdep_enabled && \ (preempt_count() != 0 || \ !this_cpu_read(hardirqs_enabled))); \ } while (0) #define lockdep_assert_preemption_disabled() \ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ __lockdep_enabled && \ (preempt_count() == 0 && \ this_cpu_read(hardirqs_enabled))); \ } while (0) /* * Acceptable for protecting per-CPU resources accessed from BH. * Much like in_softirq() - semantics are ambiguous, use carefully. */ #define lockdep_assert_in_softirq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && \ (!in_softirq() || in_irq() || in_nmi())); \ } while (0) extern void lockdep_assert_in_softirq_func(void); #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) # define might_lock_nested(lock, subclass) do { } while (0) # define lockdep_assert_irqs_enabled() do { } while (0) # define lockdep_assert_irqs_disabled() do { } while (0) # define lockdep_assert_in_irq() do { } while (0) # define lockdep_assert_no_hardirq() do { } while (0) # define lockdep_assert_preemption_enabled() do { } while (0) # define lockdep_assert_preemption_disabled() do { } while (0) # define lockdep_assert_in_softirq() do { } while (0) # define lockdep_assert_in_softirq_func() do { } while (0) #endif #ifdef CONFIG_PROVE_RAW_LOCK_NESTING # define lockdep_assert_RT_in_threaded_ctx() do { \ WARN_ONCE(debug_locks && !current->lockdep_recursion && \ lockdep_hardirq_context() && \ !(current->hardirq_threaded || current->irq_config), \ "Not in threaded context on PREEMPT_RT as expected\n"); \ } while (0) #else # define lockdep_assert_RT_in_threaded_ctx() do { } while (0) #endif #ifdef CONFIG_LOCKDEP void lockdep_rcu_suspicious(const char *file, const int line, const char *s); #else static inline void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { } #endif #endif /* __LINUX_LOCKDEP_H */
3932 1708 2433 2628 1449 1265 2357 748 1 274 2510 2058 2100 3 29 29 351 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 // SPDX-License-Identifier: GPL-2.0 // Generated by scripts/atomic/gen-atomic-fallback.sh // DO NOT MODIFY THIS FILE DIRECTLY #ifndef _LINUX_ATOMIC_FALLBACK_H #define _LINUX_ATOMIC_FALLBACK_H #include <linux/compiler.h> #if defined(arch_xchg) #define raw_xchg arch_xchg #elif defined(arch_xchg_relaxed) #define raw_xchg(...) \ __atomic_op_fence(arch_xchg, __VA_ARGS__) #else extern void raw_xchg_not_implemented(void); #define raw_xchg(...) raw_xchg_not_implemented() #endif #if defined(arch_xchg_acquire) #define raw_xchg_acquire arch_xchg_acquire #elif defined(arch_xchg_relaxed) #define raw_xchg_acquire(...) \ __atomic_op_acquire(arch_xchg, __VA_ARGS__) #elif defined(arch_xchg) #define raw_xchg_acquire arch_xchg #else extern void raw_xchg_acquire_not_implemented(void); #define raw_xchg_acquire(...) raw_xchg_acquire_not_implemented() #endif #if defined(arch_xchg_release) #define raw_xchg_release arch_xchg_release #elif defined(arch_xchg_relaxed) #define raw_xchg_release(...) \ __atomic_op_release(arch_xchg, __VA_ARGS__) #elif defined(arch_xchg) #define raw_xchg_release arch_xchg #else extern void raw_xchg_release_not_implemented(void); #define raw_xchg_release(...) raw_xchg_release_not_implemented() #endif #if defined(arch_xchg_relaxed) #define raw_xchg_relaxed arch_xchg_relaxed #elif defined(arch_xchg) #define raw_xchg_relaxed arch_xchg #else extern void raw_xchg_relaxed_not_implemented(void); #define raw_xchg_relaxed(...) raw_xchg_relaxed_not_implemented() #endif #if defined(arch_cmpxchg) #define raw_cmpxchg arch_cmpxchg #elif defined(arch_cmpxchg_relaxed) #define raw_cmpxchg(...) \ __atomic_op_fence(arch_cmpxchg, __VA_ARGS__) #else extern void raw_cmpxchg_not_implemented(void); #define raw_cmpxchg(...) raw_cmpxchg_not_implemented() #endif #if defined(arch_cmpxchg_acquire) #define raw_cmpxchg_acquire arch_cmpxchg_acquire #elif defined(arch_cmpxchg_relaxed) #define raw_cmpxchg_acquire(...) \ __atomic_op_acquire(arch_cmpxchg, __VA_ARGS__) #elif defined(arch_cmpxchg) #define raw_cmpxchg_acquire arch_cmpxchg #else extern void raw_cmpxchg_acquire_not_implemented(void); #define raw_cmpxchg_acquire(...) raw_cmpxchg_acquire_not_implemented() #endif #if defined(arch_cmpxchg_release) #define raw_cmpxchg_release arch_cmpxchg_release #elif defined(arch_cmpxchg_relaxed) #define raw_cmpxchg_release(...) \ __atomic_op_release(arch_cmpxchg, __VA_ARGS__) #elif defined(arch_cmpxchg) #define raw_cmpxchg_release arch_cmpxchg #else extern void raw_cmpxchg_release_not_implemented(void); #define raw_cmpxchg_release(...) raw_cmpxchg_release_not_implemented() #endif #if defined(arch_cmpxchg_relaxed) #define raw_cmpxchg_relaxed arch_cmpxchg_relaxed #elif defined(arch_cmpxchg) #define raw_cmpxchg_relaxed arch_cmpxchg #else extern void raw_cmpxchg_relaxed_not_implemented(void); #define raw_cmpxchg_relaxed(...) raw_cmpxchg_relaxed_not_implemented() #endif #if defined(arch_cmpxchg64) #define raw_cmpxchg64 arch_cmpxchg64 #elif defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64(...) \ __atomic_op_fence(arch_cmpxchg64, __VA_ARGS__) #else extern void raw_cmpxchg64_not_implemented(void); #define raw_cmpxchg64(...) raw_cmpxchg64_not_implemented() #endif #if defined(arch_cmpxchg64_acquire) #define raw_cmpxchg64_acquire arch_cmpxchg64_acquire #elif defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64_acquire(...) \ __atomic_op_acquire(arch_cmpxchg64, __VA_ARGS__) #elif defined(arch_cmpxchg64) #define raw_cmpxchg64_acquire arch_cmpxchg64 #else extern void raw_cmpxchg64_acquire_not_implemented(void); #define raw_cmpxchg64_acquire(...) raw_cmpxchg64_acquire_not_implemented() #endif #if defined(arch_cmpxchg64_release) #define raw_cmpxchg64_release arch_cmpxchg64_release #elif defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64_release(...) \ __atomic_op_release(arch_cmpxchg64, __VA_ARGS__) #elif defined(arch_cmpxchg64) #define raw_cmpxchg64_release arch_cmpxchg64 #else extern void raw_cmpxchg64_release_not_implemented(void); #define raw_cmpxchg64_release(...) raw_cmpxchg64_release_not_implemented() #endif #if defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64_relaxed arch_cmpxchg64_relaxed #elif defined(arch_cmpxchg64) #define raw_cmpxchg64_relaxed arch_cmpxchg64 #else extern void raw_cmpxchg64_relaxed_not_implemented(void); #define raw_cmpxchg64_relaxed(...) raw_cmpxchg64_relaxed_not_implemented() #endif #if defined(arch_cmpxchg128) #define raw_cmpxchg128 arch_cmpxchg128 #elif defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128(...) \ __atomic_op_fence(arch_cmpxchg128, __VA_ARGS__) #else extern void raw_cmpxchg128_not_implemented(void); #define raw_cmpxchg128(...) raw_cmpxchg128_not_implemented() #endif #if defined(arch_cmpxchg128_acquire) #define raw_cmpxchg128_acquire arch_cmpxchg128_acquire #elif defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128_acquire(...) \ __atomic_op_acquire(arch_cmpxchg128, __VA_ARGS__) #elif defined(arch_cmpxchg128) #define raw_cmpxchg128_acquire arch_cmpxchg128 #else extern void raw_cmpxchg128_acquire_not_implemented(void); #define raw_cmpxchg128_acquire(...) raw_cmpxchg128_acquire_not_implemented() #endif #if defined(arch_cmpxchg128_release) #define raw_cmpxchg128_release arch_cmpxchg128_release #elif defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128_release(...) \ __atomic_op_release(arch_cmpxchg128, __VA_ARGS__) #elif defined(arch_cmpxchg128) #define raw_cmpxchg128_release arch_cmpxchg128 #else extern void raw_cmpxchg128_release_not_implemented(void); #define raw_cmpxchg128_release(...) raw_cmpxchg128_release_not_implemented() #endif #if defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128_relaxed arch_cmpxchg128_relaxed #elif defined(arch_cmpxchg128) #define raw_cmpxchg128_relaxed arch_cmpxchg128 #else extern void raw_cmpxchg128_relaxed_not_implemented(void); #define raw_cmpxchg128_relaxed(...) raw_cmpxchg128_relaxed_not_implemented() #endif #if defined(arch_try_cmpxchg) #define raw_try_cmpxchg arch_try_cmpxchg #elif defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg(...) \ __atomic_op_fence(arch_try_cmpxchg, __VA_ARGS__) #else #define raw_try_cmpxchg(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg_acquire) #define raw_try_cmpxchg_acquire arch_try_cmpxchg_acquire #elif defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg_acquire(...) \ __atomic_op_acquire(arch_try_cmpxchg, __VA_ARGS__) #elif defined(arch_try_cmpxchg) #define raw_try_cmpxchg_acquire arch_try_cmpxchg #else #define raw_try_cmpxchg_acquire(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_acquire((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg_release) #define raw_try_cmpxchg_release arch_try_cmpxchg_release #elif defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg_release(...) \ __atomic_op_release(arch_try_cmpxchg, __VA_ARGS__) #elif defined(arch_try_cmpxchg) #define raw_try_cmpxchg_release arch_try_cmpxchg #else #define raw_try_cmpxchg_release(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_release((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg_relaxed arch_try_cmpxchg_relaxed #elif defined(arch_try_cmpxchg) #define raw_try_cmpxchg_relaxed arch_try_cmpxchg #else #define raw_try_cmpxchg_relaxed(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_relaxed((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64 arch_try_cmpxchg64 #elif defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64(...) \ __atomic_op_fence(arch_try_cmpxchg64, __VA_ARGS__) #else #define raw_try_cmpxchg64(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64_acquire) #define raw_try_cmpxchg64_acquire arch_try_cmpxchg64_acquire #elif defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64_acquire(...) \ __atomic_op_acquire(arch_try_cmpxchg64, __VA_ARGS__) #elif defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64_acquire arch_try_cmpxchg64 #else #define raw_try_cmpxchg64_acquire(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_acquire((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64_release) #define raw_try_cmpxchg64_release arch_try_cmpxchg64_release #elif defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64_release(...) \ __atomic_op_release(arch_try_cmpxchg64, __VA_ARGS__) #elif defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64_release arch_try_cmpxchg64 #else #define raw_try_cmpxchg64_release(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_release((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64_relaxed #elif defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64 #else #define raw_try_cmpxchg64_relaxed(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_relaxed((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128 arch_try_cmpxchg128 #elif defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128(...) \ __atomic_op_fence(arch_try_cmpxchg128, __VA_ARGS__) #else #define raw_try_cmpxchg128(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128_acquire) #define raw_try_cmpxchg128_acquire arch_try_cmpxchg128_acquire #elif defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128_acquire(...) \ __atomic_op_acquire(arch_try_cmpxchg128, __VA_ARGS__) #elif defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128_acquire arch_try_cmpxchg128 #else #define raw_try_cmpxchg128_acquire(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_acquire((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128_release) #define raw_try_cmpxchg128_release arch_try_cmpxchg128_release #elif defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128_release(...) \ __atomic_op_release(arch_try_cmpxchg128, __VA_ARGS__) #elif defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128_release arch_try_cmpxchg128 #else #define raw_try_cmpxchg128_release(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_release((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128_relaxed #elif defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128 #else #define raw_try_cmpxchg128_relaxed(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_relaxed((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_cmpxchg_local arch_cmpxchg_local #ifdef arch_try_cmpxchg_local #define raw_try_cmpxchg_local arch_try_cmpxchg_local #else #define raw_try_cmpxchg_local(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_local((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_cmpxchg64_local arch_cmpxchg64_local #ifdef arch_try_cmpxchg64_local #define raw_try_cmpxchg64_local arch_try_cmpxchg64_local #else #define raw_try_cmpxchg64_local(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_local((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_cmpxchg128_local arch_cmpxchg128_local #ifdef arch_try_cmpxchg128_local #define raw_try_cmpxchg128_local arch_try_cmpxchg128_local #else #define raw_try_cmpxchg128_local(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_local((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_sync_cmpxchg arch_sync_cmpxchg #ifdef arch_sync_try_cmpxchg #define raw_sync_try_cmpxchg arch_sync_try_cmpxchg #else #define raw_sync_try_cmpxchg(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_sync_cmpxchg((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif /** * raw_atomic_read() - atomic load with relaxed ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_read() elsewhere. * * Return: The value loaded from @v. */ static __always_inline int raw_atomic_read(const atomic_t *v) { return arch_atomic_read(v); } /** * raw_atomic_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with acquire ordering. * * Safe to use in noinstr code; prefer atomic_read_acquire() elsewhere. * * Return: The value loaded from @v. */ static __always_inline int raw_atomic_read_acquire(const atomic_t *v) { #if defined(arch_atomic_read_acquire) return arch_atomic_read_acquire(v); #else int ret; if (__native_word(atomic_t)) { ret = smp_load_acquire(&(v)->counter); } else { ret = raw_atomic_read(v); __atomic_acquire_fence(); } return ret; #endif } /** * raw_atomic_set() - atomic set with relaxed ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_set() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_set(atomic_t *v, int i) { arch_atomic_set(v, i); } /** * raw_atomic_set_release() - atomic set with release ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with release ordering. * * Safe to use in noinstr code; prefer atomic_set_release() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_set_release(atomic_t *v, int i) { #if defined(arch_atomic_set_release) arch_atomic_set_release(v, i); #else if (__native_word(atomic_t)) { smp_store_release(&(v)->counter, i); } else { __atomic_release_fence(); raw_atomic_set(v, i); } #endif } /** * raw_atomic_add() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_add() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_add(int i, atomic_t *v) { arch_atomic_add(i, v); } /** * raw_atomic_add_return() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_add_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return(int i, atomic_t *v) { #if defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #elif defined(arch_atomic_add_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_add_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_add_return" #endif } /** * raw_atomic_add_return_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_add_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return_acquire(int i, atomic_t *v) { #if defined(arch_atomic_add_return_acquire) return arch_atomic_add_return_acquire(i, v); #elif defined(arch_atomic_add_return_relaxed) int ret = arch_atomic_add_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #else #error "Unable to define raw_atomic_add_return_acquire" #endif } /** * raw_atomic_add_return_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_add_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return_release(int i, atomic_t *v) { #if defined(arch_atomic_add_return_release) return arch_atomic_add_return_release(i, v); #elif defined(arch_atomic_add_return_relaxed) __atomic_release_fence(); return arch_atomic_add_return_relaxed(i, v); #elif defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #else #error "Unable to define raw_atomic_add_return_release" #endif } /** * raw_atomic_add_return_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_add_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_add_return_relaxed) return arch_atomic_add_return_relaxed(i, v); #elif defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #else #error "Unable to define raw_atomic_add_return_relaxed" #endif } /** * raw_atomic_fetch_add() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #elif defined(arch_atomic_fetch_add_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_add_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_add" #endif } /** * raw_atomic_fetch_add_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add_acquire) return arch_atomic_fetch_add_acquire(i, v); #elif defined(arch_atomic_fetch_add_relaxed) int ret = arch_atomic_fetch_add_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #else #error "Unable to define raw_atomic_fetch_add_acquire" #endif } /** * raw_atomic_fetch_add_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add_release) return arch_atomic_fetch_add_release(i, v); #elif defined(arch_atomic_fetch_add_relaxed) __atomic_release_fence(); return arch_atomic_fetch_add_relaxed(i, v); #elif defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #else #error "Unable to define raw_atomic_fetch_add_release" #endif } /** * raw_atomic_fetch_add_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add_relaxed) return arch_atomic_fetch_add_relaxed(i, v); #elif defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #else #error "Unable to define raw_atomic_fetch_add_relaxed" #endif } /** * raw_atomic_sub() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_sub() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_sub(int i, atomic_t *v) { arch_atomic_sub(i, v); } /** * raw_atomic_sub_return() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_sub_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return(int i, atomic_t *v) { #if defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #elif defined(arch_atomic_sub_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_sub_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_sub_return" #endif } /** * raw_atomic_sub_return_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_sub_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return_acquire(int i, atomic_t *v) { #if defined(arch_atomic_sub_return_acquire) return arch_atomic_sub_return_acquire(i, v); #elif defined(arch_atomic_sub_return_relaxed) int ret = arch_atomic_sub_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #else #error "Unable to define raw_atomic_sub_return_acquire" #endif } /** * raw_atomic_sub_return_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_sub_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return_release(int i, atomic_t *v) { #if defined(arch_atomic_sub_return_release) return arch_atomic_sub_return_release(i, v); #elif defined(arch_atomic_sub_return_relaxed) __atomic_release_fence(); return arch_atomic_sub_return_relaxed(i, v); #elif defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #else #error "Unable to define raw_atomic_sub_return_release" #endif } /** * raw_atomic_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_sub_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_sub_return_relaxed) return arch_atomic_sub_return_relaxed(i, v); #elif defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #else #error "Unable to define raw_atomic_sub_return_relaxed" #endif } /** * raw_atomic_fetch_sub() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #elif defined(arch_atomic_fetch_sub_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_sub_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_sub" #endif } /** * raw_atomic_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub_acquire) return arch_atomic_fetch_sub_acquire(i, v); #elif defined(arch_atomic_fetch_sub_relaxed) int ret = arch_atomic_fetch_sub_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #else #error "Unable to define raw_atomic_fetch_sub_acquire" #endif } /** * raw_atomic_fetch_sub_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub_release) return arch_atomic_fetch_sub_release(i, v); #elif defined(arch_atomic_fetch_sub_relaxed) __atomic_release_fence(); return arch_atomic_fetch_sub_relaxed(i, v); #elif defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #else #error "Unable to define raw_atomic_fetch_sub_release" #endif } /** * raw_atomic_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub_relaxed) return arch_atomic_fetch_sub_relaxed(i, v); #elif defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #else #error "Unable to define raw_atomic_fetch_sub_relaxed" #endif } /** * raw_atomic_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_inc() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_inc(atomic_t *v) { #if defined(arch_atomic_inc) arch_atomic_inc(v); #else raw_atomic_add(1, v); #endif } /** * raw_atomic_inc_return() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_inc_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return(atomic_t *v) { #if defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #elif defined(arch_atomic_inc_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_inc_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_add_return(1, v); #endif } /** * raw_atomic_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_inc_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return_acquire(atomic_t *v) { #if defined(arch_atomic_inc_return_acquire) return arch_atomic_inc_return_acquire(v); #elif defined(arch_atomic_inc_return_relaxed) int ret = arch_atomic_inc_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #else return raw_atomic_add_return_acquire(1, v); #endif } /** * raw_atomic_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_inc_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return_release(atomic_t *v) { #if defined(arch_atomic_inc_return_release) return arch_atomic_inc_return_release(v); #elif defined(arch_atomic_inc_return_relaxed) __atomic_release_fence(); return arch_atomic_inc_return_relaxed(v); #elif defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #else return raw_atomic_add_return_release(1, v); #endif } /** * raw_atomic_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_inc_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return_relaxed(atomic_t *v) { #if defined(arch_atomic_inc_return_relaxed) return arch_atomic_inc_return_relaxed(v); #elif defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #else return raw_atomic_add_return_relaxed(1, v); #endif } /** * raw_atomic_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc(atomic_t *v) { #if defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #elif defined(arch_atomic_fetch_inc_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_inc_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_fetch_add(1, v); #endif } /** * raw_atomic_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc_acquire(atomic_t *v) { #if defined(arch_atomic_fetch_inc_acquire) return arch_atomic_fetch_inc_acquire(v); #elif defined(arch_atomic_fetch_inc_relaxed) int ret = arch_atomic_fetch_inc_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #else return raw_atomic_fetch_add_acquire(1, v); #endif } /** * raw_atomic_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc_release(atomic_t *v) { #if defined(arch_atomic_fetch_inc_release) return arch_atomic_fetch_inc_release(v); #elif defined(arch_atomic_fetch_inc_relaxed) __atomic_release_fence(); return arch_atomic_fetch_inc_relaxed(v); #elif defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #else return raw_atomic_fetch_add_release(1, v); #endif } /** * raw_atomic_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc_relaxed(atomic_t *v) { #if defined(arch_atomic_fetch_inc_relaxed) return arch_atomic_fetch_inc_relaxed(v); #elif defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #else return raw_atomic_fetch_add_relaxed(1, v); #endif } /** * raw_atomic_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_dec() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_dec(atomic_t *v) { #if defined(arch_atomic_dec) arch_atomic_dec(v); #else raw_atomic_sub(1, v); #endif } /** * raw_atomic_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_dec_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return(atomic_t *v) { #if defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #elif defined(arch_atomic_dec_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_dec_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_sub_return(1, v); #endif } /** * raw_atomic_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_dec_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return_acquire(atomic_t *v) { #if defined(arch_atomic_dec_return_acquire) return arch_atomic_dec_return_acquire(v); #elif defined(arch_atomic_dec_return_relaxed) int ret = arch_atomic_dec_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #else return raw_atomic_sub_return_acquire(1, v); #endif } /** * raw_atomic_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_dec_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return_release(atomic_t *v) { #if defined(arch_atomic_dec_return_release) return arch_atomic_dec_return_release(v); #elif defined(arch_atomic_dec_return_relaxed) __atomic_release_fence(); return arch_atomic_dec_return_relaxed(v); #elif defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #else return raw_atomic_sub_return_release(1, v); #endif } /** * raw_atomic_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_dec_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return_relaxed(atomic_t *v) { #if defined(arch_atomic_dec_return_relaxed) return arch_atomic_dec_return_relaxed(v); #elif defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #else return raw_atomic_sub_return_relaxed(1, v); #endif } /** * raw_atomic_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec(atomic_t *v) { #if defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #elif defined(arch_atomic_fetch_dec_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_dec_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_fetch_sub(1, v); #endif } /** * raw_atomic_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec_acquire(atomic_t *v) { #if defined(arch_atomic_fetch_dec_acquire) return arch_atomic_fetch_dec_acquire(v); #elif defined(arch_atomic_fetch_dec_relaxed) int ret = arch_atomic_fetch_dec_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #else return raw_atomic_fetch_sub_acquire(1, v); #endif } /** * raw_atomic_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec_release(atomic_t *v) { #if defined(arch_atomic_fetch_dec_release) return arch_atomic_fetch_dec_release(v); #elif defined(arch_atomic_fetch_dec_relaxed) __atomic_release_fence(); return arch_atomic_fetch_dec_relaxed(v); #elif defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #else return raw_atomic_fetch_sub_release(1, v); #endif } /** * raw_atomic_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec_relaxed(atomic_t *v) { #if defined(arch_atomic_fetch_dec_relaxed) return arch_atomic_fetch_dec_relaxed(v); #elif defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #else return raw_atomic_fetch_sub_relaxed(1, v); #endif } /** * raw_atomic_and() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_and() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_and(int i, atomic_t *v) { arch_atomic_and(i, v); } /** * raw_atomic_fetch_and() - atomic bitwise AND with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #elif defined(arch_atomic_fetch_and_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_and_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_and" #endif } /** * raw_atomic_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and_acquire) return arch_atomic_fetch_and_acquire(i, v); #elif defined(arch_atomic_fetch_and_relaxed) int ret = arch_atomic_fetch_and_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #else #error "Unable to define raw_atomic_fetch_and_acquire" #endif } /** * raw_atomic_fetch_and_release() - atomic bitwise AND with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and_release) return arch_atomic_fetch_and_release(i, v); #elif defined(arch_atomic_fetch_and_relaxed) __atomic_release_fence(); return arch_atomic_fetch_and_relaxed(i, v); #elif defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #else #error "Unable to define raw_atomic_fetch_and_release" #endif } /** * raw_atomic_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and_relaxed) return arch_atomic_fetch_and_relaxed(i, v); #elif defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #else #error "Unable to define raw_atomic_fetch_and_relaxed" #endif } /** * raw_atomic_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_andnot() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_andnot(int i, atomic_t *v) { #if defined(arch_atomic_andnot) arch_atomic_andnot(i, v); #else raw_atomic_and(~i, v); #endif } /** * raw_atomic_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #elif defined(arch_atomic_fetch_andnot_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_andnot_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic_fetch_and(~i, v); #endif } /** * raw_atomic_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot_acquire) return arch_atomic_fetch_andnot_acquire(i, v); #elif defined(arch_atomic_fetch_andnot_relaxed) int ret = arch_atomic_fetch_andnot_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #else return raw_atomic_fetch_and_acquire(~i, v); #endif } /** * raw_atomic_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot_release) return arch_atomic_fetch_andnot_release(i, v); #elif defined(arch_atomic_fetch_andnot_relaxed) __atomic_release_fence(); return arch_atomic_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #else return raw_atomic_fetch_and_release(~i, v); #endif } /** * raw_atomic_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot_relaxed) return arch_atomic_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #else return raw_atomic_fetch_and_relaxed(~i, v); #endif } /** * raw_atomic_or() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_or() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_or(int i, atomic_t *v) { arch_atomic_or(i, v); } /** * raw_atomic_fetch_or() - atomic bitwise OR with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #elif defined(arch_atomic_fetch_or_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_or_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_or" #endif } /** * raw_atomic_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or_acquire) return arch_atomic_fetch_or_acquire(i, v); #elif defined(arch_atomic_fetch_or_relaxed) int ret = arch_atomic_fetch_or_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #else #error "Unable to define raw_atomic_fetch_or_acquire" #endif } /** * raw_atomic_fetch_or_release() - atomic bitwise OR with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or_release) return arch_atomic_fetch_or_release(i, v); #elif defined(arch_atomic_fetch_or_relaxed) __atomic_release_fence(); return arch_atomic_fetch_or_relaxed(i, v); #elif defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #else #error "Unable to define raw_atomic_fetch_or_release" #endif } /** * raw_atomic_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or_relaxed) return arch_atomic_fetch_or_relaxed(i, v); #elif defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #else #error "Unable to define raw_atomic_fetch_or_relaxed" #endif } /** * raw_atomic_xor() - atomic bitwise XOR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_xor() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_xor(int i, atomic_t *v) { arch_atomic_xor(i, v); } /** * raw_atomic_fetch_xor() - atomic bitwise XOR with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #elif defined(arch_atomic_fetch_xor_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_xor_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_xor" #endif } /** * raw_atomic_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor_acquire) return arch_atomic_fetch_xor_acquire(i, v); #elif defined(arch_atomic_fetch_xor_relaxed) int ret = arch_atomic_fetch_xor_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #else #error "Unable to define raw_atomic_fetch_xor_acquire" #endif } /** * raw_atomic_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor_release) return arch_atomic_fetch_xor_release(i, v); #elif defined(arch_atomic_fetch_xor_relaxed) __atomic_release_fence(); return arch_atomic_fetch_xor_relaxed(i, v); #elif defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #else #error "Unable to define raw_atomic_fetch_xor_release" #endif } /** * raw_atomic_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor_relaxed) return arch_atomic_fetch_xor_relaxed(i, v); #elif defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #else #error "Unable to define raw_atomic_fetch_xor_relaxed" #endif } /** * raw_atomic_xchg() - atomic exchange with full ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic_xchg() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg(atomic_t *v, int new) { #if defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #elif defined(arch_atomic_xchg_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_xchg_relaxed(v, new); __atomic_post_full_fence(); return ret; #else return raw_xchg(&v->counter, new); #endif } /** * raw_atomic_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic_xchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg_acquire(atomic_t *v, int new) { #if defined(arch_atomic_xchg_acquire) return arch_atomic_xchg_acquire(v, new); #elif defined(arch_atomic_xchg_relaxed) int ret = arch_atomic_xchg_relaxed(v, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #else return raw_xchg_acquire(&v->counter, new); #endif } /** * raw_atomic_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic_xchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg_release(atomic_t *v, int new) { #if defined(arch_atomic_xchg_release) return arch_atomic_xchg_release(v, new); #elif defined(arch_atomic_xchg_relaxed) __atomic_release_fence(); return arch_atomic_xchg_relaxed(v, new); #elif defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #else return raw_xchg_release(&v->counter, new); #endif } /** * raw_atomic_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_xchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg_relaxed(atomic_t *v, int new) { #if defined(arch_atomic_xchg_relaxed) return arch_atomic_xchg_relaxed(v, new); #elif defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #else return raw_xchg_relaxed(&v->counter, new); #endif } /** * raw_atomic_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #elif defined(arch_atomic_cmpxchg_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else return raw_cmpxchg(&v->counter, old, new); #endif } /** * raw_atomic_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg_acquire(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg_acquire) return arch_atomic_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic_cmpxchg_relaxed) int ret = arch_atomic_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #else return raw_cmpxchg_acquire(&v->counter, old, new); #endif } /** * raw_atomic_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg_release(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg_release) return arch_atomic_cmpxchg_release(v, old, new); #elif defined(arch_atomic_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #else return raw_cmpxchg_release(&v->counter, old, new); #endif } /** * raw_atomic_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg_relaxed) return arch_atomic_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #else return raw_cmpxchg_relaxed(&v->counter, old, new); #endif } /** * raw_atomic_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #elif defined(arch_atomic_try_cmpxchg_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic_try_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else int r, o = *old; r = raw_atomic_cmpxchg(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_acquire() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg_acquire) return arch_atomic_try_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic_try_cmpxchg_relaxed) bool ret = arch_atomic_try_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #else int r, o = *old; r = raw_atomic_cmpxchg_acquire(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_release() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg_release) return arch_atomic_try_cmpxchg_release(v, old, new); #elif defined(arch_atomic_try_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #else int r, o = *old; r = raw_atomic_cmpxchg_release(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_relaxed() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg_relaxed) return arch_atomic_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #else int r, o = *old; r = raw_atomic_cmpxchg_relaxed(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_sub_and_test() - atomic subtract and test if zero with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_sub_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_sub_and_test(int i, atomic_t *v) { #if defined(arch_atomic_sub_and_test) return arch_atomic_sub_and_test(i, v); #else return raw_atomic_sub_return(i, v) == 0; #endif } /** * raw_atomic_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_dec_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_dec_and_test(atomic_t *v) { #if defined(arch_atomic_dec_and_test) return arch_atomic_dec_and_test(v); #else return raw_atomic_dec_return(v) == 0; #endif } /** * raw_atomic_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_inc_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_inc_and_test(atomic_t *v) { #if defined(arch_atomic_inc_and_test) return arch_atomic_inc_and_test(v); #else return raw_atomic_inc_return(v) == 0; #endif } /** * raw_atomic_add_negative() - atomic add and test if negative with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_add_negative() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative(int i, atomic_t *v) { #if defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #elif defined(arch_atomic_add_negative_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic_add_negative_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic_add_return(i, v) < 0; #endif } /** * raw_atomic_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_add_negative_acquire() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative_acquire(int i, atomic_t *v) { #if defined(arch_atomic_add_negative_acquire) return arch_atomic_add_negative_acquire(i, v); #elif defined(arch_atomic_add_negative_relaxed) bool ret = arch_atomic_add_negative_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #else return raw_atomic_add_return_acquire(i, v) < 0; #endif } /** * raw_atomic_add_negative_release() - atomic add and test if negative with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_add_negative_release() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative_release(int i, atomic_t *v) { #if defined(arch_atomic_add_negative_release) return arch_atomic_add_negative_release(i, v); #elif defined(arch_atomic_add_negative_relaxed) __atomic_release_fence(); return arch_atomic_add_negative_relaxed(i, v); #elif defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #else return raw_atomic_add_return_release(i, v) < 0; #endif } /** * raw_atomic_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_add_negative_relaxed() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_add_negative_relaxed) return arch_atomic_add_negative_relaxed(i, v); #elif defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #else return raw_atomic_add_return_relaxed(i, v) < 0; #endif } /** * raw_atomic_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_t * @a: int value to add * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_fetch_add_unless() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_unless(atomic_t *v, int a, int u) { #if defined(arch_atomic_fetch_add_unless) return arch_atomic_fetch_add_unless(v, a, u); #else int c = raw_atomic_read(v); do { if (unlikely(c == u)) break; } while (!raw_atomic_try_cmpxchg(v, &c, c + a)); return c; #endif } /** * raw_atomic_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_t * @a: int value to add * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_add_unless() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_add_unless(atomic_t *v, int a, int u) { #if defined(arch_atomic_add_unless) return arch_atomic_add_unless(v, a, u); #else return raw_atomic_fetch_add_unless(v, a, u) != u; #endif } /** * raw_atomic_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_inc_not_zero() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_inc_not_zero(atomic_t *v) { #if defined(arch_atomic_inc_not_zero) return arch_atomic_inc_not_zero(v); #else return raw_atomic_add_unless(v, 1, 0); #endif } /** * raw_atomic_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_inc_unless_negative() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_inc_unless_negative(atomic_t *v) { #if defined(arch_atomic_inc_unless_negative) return arch_atomic_inc_unless_negative(v); #else int c = raw_atomic_read(v); do { if (unlikely(c < 0)) return false; } while (!raw_atomic_try_cmpxchg(v, &c, c + 1)); return true; #endif } /** * raw_atomic_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_dec_unless_positive() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_dec_unless_positive(atomic_t *v) { #if defined(arch_atomic_dec_unless_positive) return arch_atomic_dec_unless_positive(v); #else int c = raw_atomic_read(v); do { if (unlikely(c > 0)) return false; } while (!raw_atomic_try_cmpxchg(v, &c, c - 1)); return true; #endif } /** * raw_atomic_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_dec_if_positive() elsewhere. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline int raw_atomic_dec_if_positive(atomic_t *v) { #if defined(arch_atomic_dec_if_positive) return arch_atomic_dec_if_positive(v); #else int dec, c = raw_atomic_read(v); do { dec = c - 1; if (unlikely(dec < 0)) break; } while (!raw_atomic_try_cmpxchg(v, &c, dec)); return dec; #endif } #ifdef CONFIG_GENERIC_ATOMIC64 #include <asm-generic/atomic64.h> #endif /** * raw_atomic64_read() - atomic load with relaxed ordering * @v: pointer to atomic64_t * * Atomically loads the value of @v with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_read() elsewhere. * * Return: The value loaded from @v. */ static __always_inline s64 raw_atomic64_read(const atomic64_t *v) { return arch_atomic64_read(v); } /** * raw_atomic64_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic64_t * * Atomically loads the value of @v with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_read_acquire() elsewhere. * * Return: The value loaded from @v. */ static __always_inline s64 raw_atomic64_read_acquire(const atomic64_t *v) { #if defined(arch_atomic64_read_acquire) return arch_atomic64_read_acquire(v); #else s64 ret; if (__native_word(atomic64_t)) { ret = smp_load_acquire(&(v)->counter); } else { ret = raw_atomic64_read(v); __atomic_acquire_fence(); } return ret; #endif } /** * raw_atomic64_set() - atomic set with relaxed ordering * @v: pointer to atomic64_t * @i: s64 value to assign * * Atomically sets @v to @i with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_set() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_set(atomic64_t *v, s64 i) { arch_atomic64_set(v, i); } /** * raw_atomic64_set_release() - atomic set with release ordering * @v: pointer to atomic64_t * @i: s64 value to assign * * Atomically sets @v to @i with release ordering. * * Safe to use in noinstr code; prefer atomic64_set_release() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_set_release(atomic64_t *v, s64 i) { #if defined(arch_atomic64_set_release) arch_atomic64_set_release(v, i); #else if (__native_word(atomic64_t)) { smp_store_release(&(v)->counter, i); } else { __atomic_release_fence(); raw_atomic64_set(v, i); } #endif } /** * raw_atomic64_add() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_add() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_add(s64 i, atomic64_t *v) { arch_atomic64_add(i, v); } /** * raw_atomic64_add_return() - atomic add with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_add_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #elif defined(arch_atomic64_add_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_add_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_add_return" #endif } /** * raw_atomic64_add_return_acquire() - atomic add with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_add_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return_acquire) return arch_atomic64_add_return_acquire(i, v); #elif defined(arch_atomic64_add_return_relaxed) s64 ret = arch_atomic64_add_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #else #error "Unable to define raw_atomic64_add_return_acquire" #endif } /** * raw_atomic64_add_return_release() - atomic add with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_add_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return_release) return arch_atomic64_add_return_release(i, v); #elif defined(arch_atomic64_add_return_relaxed) __atomic_release_fence(); return arch_atomic64_add_return_relaxed(i, v); #elif defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #else #error "Unable to define raw_atomic64_add_return_release" #endif } /** * raw_atomic64_add_return_relaxed() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_add_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return_relaxed) return arch_atomic64_add_return_relaxed(i, v); #elif defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #else #error "Unable to define raw_atomic64_add_return_relaxed" #endif } /** * raw_atomic64_fetch_add() - atomic add with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #elif defined(arch_atomic64_fetch_add_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_add_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_add" #endif } /** * raw_atomic64_fetch_add_acquire() - atomic add with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add_acquire) return arch_atomic64_fetch_add_acquire(i, v); #elif defined(arch_atomic64_fetch_add_relaxed) s64 ret = arch_atomic64_fetch_add_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #else #error "Unable to define raw_atomic64_fetch_add_acquire" #endif } /** * raw_atomic64_fetch_add_release() - atomic add with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add_release) return arch_atomic64_fetch_add_release(i, v); #elif defined(arch_atomic64_fetch_add_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_add_relaxed(i, v); #elif defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #else #error "Unable to define raw_atomic64_fetch_add_release" #endif } /** * raw_atomic64_fetch_add_relaxed() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add_relaxed) return arch_atomic64_fetch_add_relaxed(i, v); #elif defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #else #error "Unable to define raw_atomic64_fetch_add_relaxed" #endif } /** * raw_atomic64_sub() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_sub() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_sub(s64 i, atomic64_t *v) { arch_atomic64_sub(i, v); } /** * raw_atomic64_sub_return() - atomic subtract with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #elif defined(arch_atomic64_sub_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_sub_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_sub_return" #endif } /** * raw_atomic64_sub_return_acquire() - atomic subtract with acquire ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return_acquire) return arch_atomic64_sub_return_acquire(i, v); #elif defined(arch_atomic64_sub_return_relaxed) s64 ret = arch_atomic64_sub_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #else #error "Unable to define raw_atomic64_sub_return_acquire" #endif } /** * raw_atomic64_sub_return_release() - atomic subtract with release ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return_release) return arch_atomic64_sub_return_release(i, v); #elif defined(arch_atomic64_sub_return_relaxed) __atomic_release_fence(); return arch_atomic64_sub_return_relaxed(i, v); #elif defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #else #error "Unable to define raw_atomic64_sub_return_release" #endif } /** * raw_atomic64_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return_relaxed) return arch_atomic64_sub_return_relaxed(i, v); #elif defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #else #error "Unable to define raw_atomic64_sub_return_relaxed" #endif } /** * raw_atomic64_fetch_sub() - atomic subtract with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #elif defined(arch_atomic64_fetch_sub_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_sub_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_sub" #endif } /** * raw_atomic64_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub_acquire) return arch_atomic64_fetch_sub_acquire(i, v); #elif defined(arch_atomic64_fetch_sub_relaxed) s64 ret = arch_atomic64_fetch_sub_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #else #error "Unable to define raw_atomic64_fetch_sub_acquire" #endif } /** * raw_atomic64_fetch_sub_release() - atomic subtract with release ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub_release) return arch_atomic64_fetch_sub_release(i, v); #elif defined(arch_atomic64_fetch_sub_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_sub_relaxed(i, v); #elif defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #else #error "Unable to define raw_atomic64_fetch_sub_release" #endif } /** * raw_atomic64_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub_relaxed) return arch_atomic64_fetch_sub_relaxed(i, v); #elif defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #else #error "Unable to define raw_atomic64_fetch_sub_relaxed" #endif } /** * raw_atomic64_inc() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_inc() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_inc(atomic64_t *v) { #if defined(arch_atomic64_inc) arch_atomic64_inc(v); #else raw_atomic64_add(1, v); #endif } /** * raw_atomic64_inc_return() - atomic increment with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return(atomic64_t *v) { #if defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #elif defined(arch_atomic64_inc_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_inc_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_add_return(1, v); #endif } /** * raw_atomic64_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return_acquire(atomic64_t *v) { #if defined(arch_atomic64_inc_return_acquire) return arch_atomic64_inc_return_acquire(v); #elif defined(arch_atomic64_inc_return_relaxed) s64 ret = arch_atomic64_inc_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #else return raw_atomic64_add_return_acquire(1, v); #endif } /** * raw_atomic64_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return_release(atomic64_t *v) { #if defined(arch_atomic64_inc_return_release) return arch_atomic64_inc_return_release(v); #elif defined(arch_atomic64_inc_return_relaxed) __atomic_release_fence(); return arch_atomic64_inc_return_relaxed(v); #elif defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #else return raw_atomic64_add_return_release(1, v); #endif } /** * raw_atomic64_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return_relaxed(atomic64_t *v) { #if defined(arch_atomic64_inc_return_relaxed) return arch_atomic64_inc_return_relaxed(v); #elif defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #else return raw_atomic64_add_return_relaxed(1, v); #endif } /** * raw_atomic64_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #elif defined(arch_atomic64_fetch_inc_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_inc_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_fetch_add(1, v); #endif } /** * raw_atomic64_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc_acquire(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc_acquire) return arch_atomic64_fetch_inc_acquire(v); #elif defined(arch_atomic64_fetch_inc_relaxed) s64 ret = arch_atomic64_fetch_inc_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #else return raw_atomic64_fetch_add_acquire(1, v); #endif } /** * raw_atomic64_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc_release(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc_release) return arch_atomic64_fetch_inc_release(v); #elif defined(arch_atomic64_fetch_inc_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_inc_relaxed(v); #elif defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #else return raw_atomic64_fetch_add_release(1, v); #endif } /** * raw_atomic64_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc_relaxed(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc_relaxed) return arch_atomic64_fetch_inc_relaxed(v); #elif defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #else return raw_atomic64_fetch_add_relaxed(1, v); #endif } /** * raw_atomic64_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_dec() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_dec(atomic64_t *v) { #if defined(arch_atomic64_dec) arch_atomic64_dec(v); #else raw_atomic64_sub(1, v); #endif } /** * raw_atomic64_dec_return() - atomic decrement with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return(atomic64_t *v) { #if defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #elif defined(arch_atomic64_dec_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_dec_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_sub_return(1, v); #endif } /** * raw_atomic64_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return_acquire(atomic64_t *v) { #if defined(arch_atomic64_dec_return_acquire) return arch_atomic64_dec_return_acquire(v); #elif defined(arch_atomic64_dec_return_relaxed) s64 ret = arch_atomic64_dec_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #else return raw_atomic64_sub_return_acquire(1, v); #endif } /** * raw_atomic64_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return_release(atomic64_t *v) { #if defined(arch_atomic64_dec_return_release) return arch_atomic64_dec_return_release(v); #elif defined(arch_atomic64_dec_return_relaxed) __atomic_release_fence(); return arch_atomic64_dec_return_relaxed(v); #elif defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #else return raw_atomic64_sub_return_release(1, v); #endif } /** * raw_atomic64_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return_relaxed(atomic64_t *v) { #if defined(arch_atomic64_dec_return_relaxed) return arch_atomic64_dec_return_relaxed(v); #elif defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #else return raw_atomic64_sub_return_relaxed(1, v); #endif } /** * raw_atomic64_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #elif defined(arch_atomic64_fetch_dec_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_dec_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_fetch_sub(1, v); #endif } /** * raw_atomic64_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec_acquire(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec_acquire) return arch_atomic64_fetch_dec_acquire(v); #elif defined(arch_atomic64_fetch_dec_relaxed) s64 ret = arch_atomic64_fetch_dec_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #else return raw_atomic64_fetch_sub_acquire(1, v); #endif } /** * raw_atomic64_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec_release(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec_release) return arch_atomic64_fetch_dec_release(v); #elif defined(arch_atomic64_fetch_dec_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_dec_relaxed(v); #elif defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #else return raw_atomic64_fetch_sub_release(1, v); #endif } /** * raw_atomic64_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec_relaxed(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec_relaxed) return arch_atomic64_fetch_dec_relaxed(v); #elif defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #else return raw_atomic64_fetch_sub_relaxed(1, v); #endif } /** * raw_atomic64_and() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_and() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_and(s64 i, atomic64_t *v) { arch_atomic64_and(i, v); } /** * raw_atomic64_fetch_and() - atomic bitwise AND with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #elif defined(arch_atomic64_fetch_and_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_and_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_and" #endif } /** * raw_atomic64_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and_acquire) return arch_atomic64_fetch_and_acquire(i, v); #elif defined(arch_atomic64_fetch_and_relaxed) s64 ret = arch_atomic64_fetch_and_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #else #error "Unable to define raw_atomic64_fetch_and_acquire" #endif } /** * raw_atomic64_fetch_and_release() - atomic bitwise AND with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and_release) return arch_atomic64_fetch_and_release(i, v); #elif defined(arch_atomic64_fetch_and_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_and_relaxed(i, v); #elif defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #else #error "Unable to define raw_atomic64_fetch_and_release" #endif } /** * raw_atomic64_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and_relaxed) return arch_atomic64_fetch_and_relaxed(i, v); #elif defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #else #error "Unable to define raw_atomic64_fetch_and_relaxed" #endif } /** * raw_atomic64_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_andnot() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_andnot(s64 i, atomic64_t *v) { #if defined(arch_atomic64_andnot) arch_atomic64_andnot(i, v); #else raw_atomic64_and(~i, v); #endif } /** * raw_atomic64_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #elif defined(arch_atomic64_fetch_andnot_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_andnot_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_fetch_and(~i, v); #endif } /** * raw_atomic64_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot_acquire) return arch_atomic64_fetch_andnot_acquire(i, v); #elif defined(arch_atomic64_fetch_andnot_relaxed) s64 ret = arch_atomic64_fetch_andnot_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #else return raw_atomic64_fetch_and_acquire(~i, v); #endif } /** * raw_atomic64_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot_release) return arch_atomic64_fetch_andnot_release(i, v); #elif defined(arch_atomic64_fetch_andnot_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #else return raw_atomic64_fetch_and_release(~i, v); #endif } /** * raw_atomic64_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot_relaxed) return arch_atomic64_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #else return raw_atomic64_fetch_and_relaxed(~i, v); #endif } /** * raw_atomic64_or() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_or() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_or(s64 i, atomic64_t *v) { arch_atomic64_or(i, v); } /** * raw_atomic64_fetch_or() - atomic bitwise OR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #elif defined(arch_atomic64_fetch_or_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_or_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_or" #endif } /** * raw_atomic64_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or_acquire) return arch_atomic64_fetch_or_acquire(i, v); #elif defined(arch_atomic64_fetch_or_relaxed) s64 ret = arch_atomic64_fetch_or_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #else #error "Unable to define raw_atomic64_fetch_or_acquire" #endif } /** * raw_atomic64_fetch_or_release() - atomic bitwise OR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or_release) return arch_atomic64_fetch_or_release(i, v); #elif defined(arch_atomic64_fetch_or_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_or_relaxed(i, v); #elif defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #else #error "Unable to define raw_atomic64_fetch_or_release" #endif } /** * raw_atomic64_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or_relaxed) return arch_atomic64_fetch_or_relaxed(i, v); #elif defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #else #error "Unable to define raw_atomic64_fetch_or_relaxed" #endif } /** * raw_atomic64_xor() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_xor() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_xor(s64 i, atomic64_t *v) { arch_atomic64_xor(i, v); } /** * raw_atomic64_fetch_xor() - atomic bitwise XOR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #elif defined(arch_atomic64_fetch_xor_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_xor_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_xor" #endif } /** * raw_atomic64_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor_acquire) return arch_atomic64_fetch_xor_acquire(i, v); #elif defined(arch_atomic64_fetch_xor_relaxed) s64 ret = arch_atomic64_fetch_xor_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #else #error "Unable to define raw_atomic64_fetch_xor_acquire" #endif } /** * raw_atomic64_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor_release) return arch_atomic64_fetch_xor_release(i, v); #elif defined(arch_atomic64_fetch_xor_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_xor_relaxed(i, v); #elif defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #else #error "Unable to define raw_atomic64_fetch_xor_release" #endif } /** * raw_atomic64_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor_relaxed) return arch_atomic64_fetch_xor_relaxed(i, v); #elif defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #else #error "Unable to define raw_atomic64_fetch_xor_relaxed" #endif } /** * raw_atomic64_xchg() - atomic exchange with full ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic64_xchg() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #elif defined(arch_atomic64_xchg_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_xchg_relaxed(v, new); __atomic_post_full_fence(); return ret; #else return raw_xchg(&v->counter, new); #endif } /** * raw_atomic64_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_xchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg_acquire(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg_acquire) return arch_atomic64_xchg_acquire(v, new); #elif defined(arch_atomic64_xchg_relaxed) s64 ret = arch_atomic64_xchg_relaxed(v, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #else return raw_xchg_acquire(&v->counter, new); #endif } /** * raw_atomic64_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic64_xchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg_release(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg_release) return arch_atomic64_xchg_release(v, new); #elif defined(arch_atomic64_xchg_relaxed) __atomic_release_fence(); return arch_atomic64_xchg_relaxed(v, new); #elif defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #else return raw_xchg_release(&v->counter, new); #endif } /** * raw_atomic64_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_xchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg_relaxed(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg_relaxed) return arch_atomic64_xchg_relaxed(v, new); #elif defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #else return raw_xchg_relaxed(&v->counter, new); #endif } /** * raw_atomic64_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #elif defined(arch_atomic64_cmpxchg_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else return raw_cmpxchg(&v->counter, old, new); #endif } /** * raw_atomic64_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg_acquire) return arch_atomic64_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic64_cmpxchg_relaxed) s64 ret = arch_atomic64_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #else return raw_cmpxchg_acquire(&v->counter, old, new); #endif } /** * raw_atomic64_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg_release) return arch_atomic64_cmpxchg_release(v, old, new); #elif defined(arch_atomic64_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic64_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #else return raw_cmpxchg_release(&v->counter, old, new); #endif } /** * raw_atomic64_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg_relaxed) return arch_atomic64_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #else return raw_cmpxchg_relaxed(&v->counter, old, new); #endif } /** * raw_atomic64_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #elif defined(arch_atomic64_try_cmpxchg_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic64_try_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else s64 r, o = *old; r = raw_atomic64_cmpxchg(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_acquire() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg_acquire) return arch_atomic64_try_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic64_try_cmpxchg_relaxed) bool ret = arch_atomic64_try_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #else s64 r, o = *old; r = raw_atomic64_cmpxchg_acquire(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_release() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg_release) return arch_atomic64_try_cmpxchg_release(v, old, new); #elif defined(arch_atomic64_try_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic64_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #else s64 r, o = *old; r = raw_atomic64_cmpxchg_release(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_relaxed() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg_relaxed) return arch_atomic64_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #else s64 r, o = *old; r = raw_atomic64_cmpxchg_relaxed(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_sub_and_test() - atomic subtract and test if zero with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_sub_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic64_sub_and_test(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_and_test) return arch_atomic64_sub_and_test(i, v); #else return raw_atomic64_sub_return(i, v) == 0; #endif } /** * raw_atomic64_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_dec_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic64_dec_and_test(atomic64_t *v) { #if defined(arch_atomic64_dec_and_test) return arch_atomic64_dec_and_test(v); #else return raw_atomic64_dec_return(v) == 0; #endif } /** * raw_atomic64_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_inc_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic64_inc_and_test(atomic64_t *v) { #if defined(arch_atomic64_inc_and_test) return arch_atomic64_inc_and_test(v); #else return raw_atomic64_inc_return(v) == 0; #endif } /** * raw_atomic64_add_negative() - atomic add and test if negative with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #elif defined(arch_atomic64_add_negative_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic64_add_negative_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_add_return(i, v) < 0; #endif } /** * raw_atomic64_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative_acquire() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative_acquire) return arch_atomic64_add_negative_acquire(i, v); #elif defined(arch_atomic64_add_negative_relaxed) bool ret = arch_atomic64_add_negative_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #else return raw_atomic64_add_return_acquire(i, v) < 0; #endif } /** * raw_atomic64_add_negative_release() - atomic add and test if negative with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative_release() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative_release) return arch_atomic64_add_negative_release(i, v); #elif defined(arch_atomic64_add_negative_relaxed) __atomic_release_fence(); return arch_atomic64_add_negative_relaxed(i, v); #elif defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #else return raw_atomic64_add_return_release(i, v) < 0; #endif } /** * raw_atomic64_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative_relaxed() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative_relaxed) return arch_atomic64_add_negative_relaxed(i, v); #elif defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #else return raw_atomic64_add_return_relaxed(i, v) < 0; #endif } /** * raw_atomic64_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_fetch_add_unless() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { #if defined(arch_atomic64_fetch_add_unless) return arch_atomic64_fetch_add_unless(v, a, u); #else s64 c = raw_atomic64_read(v); do { if (unlikely(c == u)) break; } while (!raw_atomic64_try_cmpxchg(v, &c, c + a)); return c; #endif } /** * raw_atomic64_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_add_unless() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { #if defined(arch_atomic64_add_unless) return arch_atomic64_add_unless(v, a, u); #else return raw_atomic64_fetch_add_unless(v, a, u) != u; #endif } /** * raw_atomic64_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic64_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_inc_not_zero() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_inc_not_zero(atomic64_t *v) { #if defined(arch_atomic64_inc_not_zero) return arch_atomic64_inc_not_zero(v); #else return raw_atomic64_add_unless(v, 1, 0); #endif } /** * raw_atomic64_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic64_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_inc_unless_negative() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_inc_unless_negative(atomic64_t *v) { #if defined(arch_atomic64_inc_unless_negative) return arch_atomic64_inc_unless_negative(v); #else s64 c = raw_atomic64_read(v); do { if (unlikely(c < 0)) return false; } while (!raw_atomic64_try_cmpxchg(v, &c, c + 1)); return true; #endif } /** * raw_atomic64_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic64_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_dec_unless_positive() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_dec_unless_positive(atomic64_t *v) { #if defined(arch_atomic64_dec_unless_positive) return arch_atomic64_dec_unless_positive(v); #else s64 c = raw_atomic64_read(v); do { if (unlikely(c > 0)) return false; } while (!raw_atomic64_try_cmpxchg(v, &c, c - 1)); return true; #endif } /** * raw_atomic64_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic64_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_dec_if_positive() elsewhere. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline s64 raw_atomic64_dec_if_positive(atomic64_t *v) { #if defined(arch_atomic64_dec_if_positive) return arch_atomic64_dec_if_positive(v); #else s64 dec, c = raw_atomic64_read(v); do { dec = c - 1; if (unlikely(dec < 0)) break; } while (!raw_atomic64_try_cmpxchg(v, &c, dec)); return dec; #endif } #endif /* _LINUX_ATOMIC_FALLBACK_H */ // b565db590afeeff0d7c9485ccbca5bb6e155749f
88 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 // SPDX-License-Identifier: GPL-2.0 /* RTT/RTO calculation. * * Adapted from TCP for AF_RXRPC by David Howells (dhowells@redhat.com) * * https://tools.ietf.org/html/rfc6298 * https://tools.ietf.org/html/rfc1122#section-4.2.3.1 * http://ccr.sigcomm.org/archive/1995/jan95/ccr-9501-partridge87.pdf */ #include <linux/net.h> #include "ar-internal.h" #define RXRPC_RTO_MAX (120 * USEC_PER_SEC) #define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * MSEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */ #define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */ static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) { return 200; } static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) { return (peer->srtt_us >> 3) + peer->rttvar_us; } static u32 rxrpc_bound_rto(u32 rto) { return min(rto, RXRPC_RTO_MAX); } /* * Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 * piece by Van Jacobson. * NOTE: the next three routines used to be one big routine. * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) { long m = sample_rtt_us; /* RTT */ u32 srtt = peer->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible * m stands for "measurement". * * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ if (srtt != 0) { m -= (srtt >> 3); /* m is now error in rtt est */ srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ m -= (peer->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain * for mdev in this case (alpha*beta). * Like Eifel it also prevents growth of rto, * but also it limits too fast rto decreases, * happening in pure Eifel. */ if (m > 0) m >>= 3; } else { m -= (peer->mdev_us >> 2); /* similar update on mdev */ } peer->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ if (peer->mdev_us > peer->mdev_max_us) { peer->mdev_max_us = peer->mdev_us; if (peer->mdev_max_us > peer->rttvar_us) peer->rttvar_us = peer->mdev_max_us; } } else { /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ peer->mdev_us = m << 1; /* make sure rto = 3*rtt */ peer->rttvar_us = max(peer->mdev_us, rxrpc_rto_min_us(peer)); peer->mdev_max_us = peer->rttvar_us; } peer->srtt_us = max(1U, srtt); } /* * Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ static void rxrpc_set_rto(struct rxrpc_peer *peer) { u32 rto; /* 1. If rtt variance happened to be less 50msec, it is hallucination. * It cannot be less due to utterly erratic ACK generation made * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ rto = __rxrpc_set_rto(peer); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced * with correct one. It is exactly, which we pretend to do. */ /* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo * guarantees that rto is higher. */ peer->rto_us = rxrpc_bound_rto(rto); } static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us) { if (rtt_us < 0) return; //rxrpc_update_rtt_min(peer, rtt_us); rxrpc_rtt_estimator(peer, rtt_us); rxrpc_set_rto(peer); /* RFC6298: only reset backoff on valid RTT measurement. */ peer->backoff = 0; } /* * Add RTT information to cache. This is called in softirq mode and has * exclusive access to the peer RTT data. */ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int rtt_slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, ktime_t send_time, ktime_t resp_time) { struct rxrpc_peer *peer = call->peer; s64 rtt_us; rtt_us = ktime_to_us(ktime_sub(resp_time, send_time)); if (rtt_us < 0) return; spin_lock(&peer->rtt_input_lock); rxrpc_ack_update_rtt(peer, rtt_us); if (peer->rtt_count < 3) peer->rtt_count++; spin_unlock(&peer->rtt_input_lock); trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial, peer->srtt_us >> 3, peer->rto_us); } /* * Get the retransmission timeout to set in nanoseconds, backing it off each * time we retransmit. */ ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) { u64 timo_us; u32 backoff = READ_ONCE(peer->backoff); timo_us = peer->rto_us; timo_us <<= backoff; if (retrans && timo_us * 2 <= RXRPC_RTO_MAX) WRITE_ONCE(peer->backoff, backoff + 1); if (timo_us < 1) timo_us = 1; return ns_to_ktime(timo_us * NSEC_PER_USEC); } void rxrpc_peer_init_rtt(struct rxrpc_peer *peer) { peer->rto_us = RXRPC_TIMEOUT_INIT; peer->mdev_us = RXRPC_TIMEOUT_INIT; peer->backoff = 0; //minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U); }
181 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_ERR_H #define _LINUX_ERR_H #include <linux/compiler.h> #include <linux/types.h> #include <asm/errno.h> /* * Kernel pointers have redundant information, so we can use a * scheme where we can return either an error code or a normal * pointer with the same return value. * * This should be a per-architecture thing, to allow different * error and pointer decisions. */ #define MAX_ERRNO 4095 #ifndef __ASSEMBLY__ /** * IS_ERR_VALUE - Detect an error pointer. * @x: The pointer to check. * * Like IS_ERR(), but does not generate a compiler warning if result is unused. */ #define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO) /** * ERR_PTR - Create an error pointer. * @error: A negative error code. * * Encodes @error into a pointer value. Users should consider the result * opaque and not assume anything about how the error is encoded. * * Return: A pointer with @error encoded within its value. */ static inline void * __must_check ERR_PTR(long error) { return (void *) error; } /* Return the pointer in the percpu address space. */ #define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error)) /** * PTR_ERR - Extract the error code from an error pointer. * @ptr: An error pointer. * Return: The error code within @ptr. */ static inline long __must_check PTR_ERR(__force const void *ptr) { return (long) ptr; } /* Read an error pointer from the percpu address space. */ #define PTR_ERR_PCPU(ptr) (PTR_ERR((const void *)(__force const unsigned long)(ptr))) /** * IS_ERR - Detect an error pointer. * @ptr: The pointer to check. * Return: true if @ptr is an error pointer, false otherwise. */ static inline bool __must_check IS_ERR(__force const void *ptr) { return IS_ERR_VALUE((unsigned long)ptr); } /* Read an error pointer from the percpu address space. */ #define IS_ERR_PCPU(ptr) (IS_ERR((const void *)(__force const unsigned long)(ptr))) /** * IS_ERR_OR_NULL - Detect an error pointer or a null pointer. * @ptr: The pointer to check. * * Like IS_ERR(), but also returns true for a null pointer. */ static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) { return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr); } /** * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type * @ptr: The pointer to cast. * * Explicitly cast an error-valued pointer to another pointer type in such a * way as to make it clear that's what's going on. */ static inline void * __must_check ERR_CAST(__force const void *ptr) { /* cast away the const */ return (void *) ptr; } /** * PTR_ERR_OR_ZERO - Extract the error code from a pointer if it has one. * @ptr: A potential error pointer. * * Convenience function that can be used inside a function that returns * an error code to propagate errors received as error pointers. * For example, ``return PTR_ERR_OR_ZERO(ptr);`` replaces: * * .. code-block:: c * * if (IS_ERR(ptr)) * return PTR_ERR(ptr); * else * return 0; * * Return: The error code within @ptr if it is an error pointer; 0 otherwise. */ static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr) { if (IS_ERR(ptr)) return PTR_ERR(ptr); else return 0; } #endif #endif /* _LINUX_ERR_H */
378 2103 2098 2097 2108 222 2097 2098 2107 2103 2108 2101 2047 2052 2052 2044 19 2050 2048 2048 19 2047 2049 2049 217 2107 221 20 20 206 203 2105 2097 2098 2110 2089 10 2103 64 2036 2110 2083 2110 2104 2110 2098 2110 2096 10 2085 59 2096 2098 3 7 7 7 2103 2082 59 65 2043 2110 2105 2093 2095 2097 2101 2102 236 2107 2101 2069 2081 2101 220 2100 2094 2107 2100 2097 2055 2058 2100 2102 28 2054 2049 2049 2058 2103 2106 2096 2104 2107 2108 216 2102 2100 2103 2062 2102 2057 2100 217 2100 216 2101 2098 2108 2100 562 2099 2100 236 5 5 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/printk.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Modified to make sys_syslog() more flexible: added commands to * return the last 4k of kernel messages, regardless of whether * they've been read or not. Added option to suppress kernel printk's * to the console. Added hook for sending the console messages * elsewhere, in preparation for a serial line console (someday). * Ted Ts'o, 2/11/93. * Modified for sysctl support, 1/8/97, Chris Horn. * Fixed SMP synchronization, 08/08/99, Manfred Spraul * manfred@colorfullife.com * Rewrote bits to get rid of console_lock * 01Mar01 Andrew Morton */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/mm.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/console.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/nmi.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/delay.h> #include <linux/smp.h> #include <linux/security.h> #include <linux/memblock.h> #include <linux/syscalls.h> #include <linux/syscore_ops.h> #include <linux/vmcore_info.h> #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> #include <linux/syslog.h> #include <linux/cpu.h> #include <linux/rculist.h> #include <linux/poll.h> #include <linux/irq_work.h> #include <linux/ctype.h> #include <linux/uio.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> #include <linux/uaccess.h> #include <asm/sections.h> #include <trace/events/initcall.h> #define CREATE_TRACE_POINTS #include <trace/events/printk.h> #include "printk_ringbuffer.h" #include "console_cmdline.h" #include "braille.h" #include "internal.h" int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; EXPORT_SYMBOL_GPL(console_printk); atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0); EXPORT_SYMBOL(ignore_console_lock_warning); EXPORT_TRACEPOINT_SYMBOL_GPL(console); /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. */ int oops_in_progress; EXPORT_SYMBOL(oops_in_progress); /* * console_mutex protects console_list updates and console->flags updates. * The flags are synchronized only for consoles that are registered, i.e. * accessible via the console list. */ static DEFINE_MUTEX(console_mutex); /* * console_sem protects updates to console->seq * and also provides serialization for console printing. */ static DEFINE_SEMAPHORE(console_sem, 1); HLIST_HEAD(console_list); EXPORT_SYMBOL_GPL(console_list); DEFINE_STATIC_SRCU(console_srcu); /* * System may need to suppress printk message under certain * circumstances, like after kernel panic happens. */ int __read_mostly suppress_printk; #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" }; void lockdep_assert_console_list_lock_held(void) { lockdep_assert_held(&console_mutex); } EXPORT_SYMBOL(lockdep_assert_console_list_lock_held); #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC bool console_srcu_read_lock_is_held(void) { return srcu_read_lock_held(&console_srcu); } EXPORT_SYMBOL(console_srcu_read_lock_is_held); #endif enum devkmsg_log_bits { __DEVKMSG_LOG_BIT_ON = 0, __DEVKMSG_LOG_BIT_OFF, __DEVKMSG_LOG_BIT_LOCK, }; enum devkmsg_log_masks { DEVKMSG_LOG_MASK_ON = BIT(__DEVKMSG_LOG_BIT_ON), DEVKMSG_LOG_MASK_OFF = BIT(__DEVKMSG_LOG_BIT_OFF), DEVKMSG_LOG_MASK_LOCK = BIT(__DEVKMSG_LOG_BIT_LOCK), }; /* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */ #define DEVKMSG_LOG_MASK_DEFAULT 0 static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; static int __control_devkmsg(char *str) { size_t len; if (!str) return -EINVAL; len = str_has_prefix(str, "on"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_ON; return len; } len = str_has_prefix(str, "off"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_OFF; return len; } len = str_has_prefix(str, "ratelimit"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; return len; } return -EINVAL; } static int __init control_devkmsg(char *str) { if (__control_devkmsg(str) < 0) { pr_warn("printk.devkmsg: bad option string '%s'\n", str); return 1; } /* * Set sysctl string accordingly: */ if (devkmsg_log == DEVKMSG_LOG_MASK_ON) strscpy(devkmsg_log_str, "on"); else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) strscpy(devkmsg_log_str, "off"); /* else "ratelimit" which is set by default. */ /* * Sysctl cannot change it anymore. The kernel command line setting of * this parameter is to force the setting to be permanent throughout the * runtime of the system. This is a precation measure against userspace * trying to be a smarta** and attempting to change it up on us. */ devkmsg_log |= DEVKMSG_LOG_MASK_LOCK; return 1; } __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; int err; if (write) { if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK) return -EINVAL; old = devkmsg_log; strscpy(old_str, devkmsg_log_str); } err = proc_dostring(table, write, buffer, lenp, ppos); if (err) return err; if (write) { err = __control_devkmsg(devkmsg_log_str); /* * Do not accept an unknown string OR a known string with * trailing crap... */ if (err < 0 || (err + 1 != *lenp)) { /* ... and restore old setting. */ devkmsg_log = old; strscpy(devkmsg_log_str, old_str); return -EINVAL; } } return 0; } #endif /* CONFIG_PRINTK && CONFIG_SYSCTL */ /** * console_list_lock - Lock the console list * * For console list or console->flags updates */ void console_list_lock(void) { /* * In unregister_console() and console_force_preferred_locked(), * synchronize_srcu() is called with the console_list_lock held. * Therefore it is not allowed that the console_list_lock is taken * with the srcu_lock held. * * Detecting if this context is really in the read-side critical * section is only possible if the appropriate debug options are * enabled. */ WARN_ON_ONCE(debug_lockdep_rcu_enabled() && srcu_read_lock_held(&console_srcu)); mutex_lock(&console_mutex); } EXPORT_SYMBOL(console_list_lock); /** * console_list_unlock - Unlock the console list * * Counterpart to console_list_lock() */ void console_list_unlock(void) { mutex_unlock(&console_mutex); } EXPORT_SYMBOL(console_list_unlock); /** * console_srcu_read_lock - Register a new reader for the * SRCU-protected console list * * Use for_each_console_srcu() to iterate the console list * * Context: Any context. * Return: A cookie to pass to console_srcu_read_unlock(). */ int console_srcu_read_lock(void) __acquires(&console_srcu) { return srcu_read_lock_nmisafe(&console_srcu); } EXPORT_SYMBOL(console_srcu_read_lock); /** * console_srcu_read_unlock - Unregister an old reader from * the SRCU-protected console list * @cookie: cookie returned from console_srcu_read_lock() * * Counterpart to console_srcu_read_lock() */ void console_srcu_read_unlock(int cookie) __releases(&console_srcu) { srcu_read_unlock_nmisafe(&console_srcu, cookie); } EXPORT_SYMBOL(console_srcu_read_unlock); /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. */ #define down_console_sem() do { \ down(&console_sem);\ mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\ } while (0) static int __down_trylock_console_sem(unsigned long ip) { int lock_failed; unsigned long flags; /* * Here and in __up_console_sem() we need to be in safe mode, * because spindump/WARN/etc from under console ->lock will * deadlock in printk()->down_trylock_console_sem() otherwise. */ printk_safe_enter_irqsave(flags); lock_failed = down_trylock(&console_sem); printk_safe_exit_irqrestore(flags); if (lock_failed) return 1; mutex_acquire(&console_lock_dep_map, 0, 1, ip); return 0; } #define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) static void __up_console_sem(unsigned long ip) { unsigned long flags; mutex_release(&console_lock_dep_map, ip); printk_safe_enter_irqsave(flags); up(&console_sem); printk_safe_exit_irqrestore(flags); } #define up_console_sem() __up_console_sem(_RET_IP_) static bool panic_in_progress(void) { return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); } /* Return true if a panic is in progress on the current CPU. */ bool this_cpu_in_panic(void) { /* * We can use raw_smp_processor_id() here because it is impossible for * the task to be migrated to the panic_cpu, or away from it. If * panic_cpu has already been set, and we're not currently executing on * that CPU, then we never will be. */ return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); } /* * Return true if a panic is in progress on a remote CPU. * * On true, the local CPU should immediately release any printing resources * that may be needed by the panic CPU. */ bool other_cpu_in_panic(void) { return (panic_in_progress() && !this_cpu_in_panic()); } /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ * hold it and are racing, but it helps tracking those weird code * paths in the console code where we end up in places I want * locked without the console semaphore held). */ static int console_locked; /* * Array of consoles built from command line options (console=) */ #define MAX_CMDLINECONSOLES 8 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; enum con_msg_format_flags { MSG_FORMAT_DEFAULT = 0, MSG_FORMAT_SYSLOG = (1 << 0), }; static int console_msg_format = MSG_FORMAT_DEFAULT; /* * The printk log buffer consists of a sequenced collection of records, each * containing variable length message text. Every record also contains its * own meta-data (@info). * * Every record meta-data carries the timestamp in microseconds, as well as * the standard userspace syslog level and syslog facility. The usual kernel * messages use LOG_KERN; userspace-injected messages always carry a matching * syslog facility, by default LOG_USER. The origin of every message can be * reliably determined that way. * * The human readable log message of a record is available in @text, the * length of the message text in @text_len. The stored message is not * terminated. * * Optionally, a record can carry a dictionary of properties (key/value * pairs), to provide userspace with a machine-readable message context. * * Examples for well-defined, commonly used property names are: * DEVICE=b12:8 device identifier * b12:8 block dev_t * c127:3 char dev_t * n8 netdev ifindex * +sound:card0 subsystem:devname * SUBSYSTEM=pci driver-core subsystem name * * Valid characters in property names are [a-zA-Z0-9.-_]. Property names * and values are terminated by a '\0' character. * * Example of record values: * record.text_buf = "it's a line" (unterminated) * record.info.seq = 56 * record.info.ts_nsec = 36863 * record.info.text_len = 11 * record.info.facility = 0 (LOG_KERN) * record.info.flags = 0 * record.info.level = 3 (LOG_ERR) * record.info.caller_id = 299 (task 299) * record.info.dev_info.subsystem = "pci" (terminated) * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated) * * The 'struct printk_info' buffer must never be directly exported to * userspace, it is a kernel-private implementation detail that might * need to be changed in the future, when the requirements change. * * /dev/kmsg exports the structured data in the following line format: * "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n" * * Users of the export format should ignore possible additional values * separated by ',', and find the message after the ';' character. * * The optional key/value pairs are attached as continuation lines starting * with a space character and terminated by a newline. All possible * non-prinatable characters are escaped in the "\xff" notation. */ /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); /* * Specifies if a legacy console is registered. If legacy consoles are * present, it is necessary to perform the console lock/unlock dance * whenever console flushing should occur. */ bool have_legacy_console; /* * Specifies if an nbcon console is registered. If nbcon consoles are present, * synchronous printing of legacy consoles will not occur during panic until * the backtrace has been stored to the ringbuffer. */ bool have_nbcon_console; /* * Specifies if a boot console is registered. If boot consoles are present, * nbcon consoles cannot print simultaneously and must be synchronized by * the console lock. This is because boot consoles and nbcon consoles may * have mapped the same hardware. */ bool have_boot_console; /* See printk_legacy_allow_panic_sync() for details. */ bool legacy_allow_panic_sync; #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); static DECLARE_WAIT_QUEUE_HEAD(legacy_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; /* True when _all_ printer threads are available for printing. */ bool printk_kthreads_running; struct latched_seq { seqcount_latch_t latch; u64 val[2]; }; /* * The next printk record to read after the last 'clear' command. There are * two copies (updated with seqcount_latch) so that reads can locklessly * access a valid value. Writers are synchronized by @syslog_lock. */ static struct latched_seq clear_seq = { .latch = SEQCNT_LATCH_ZERO(clear_seq.latch), .val[0] = 0, .val[1] = 0, }; #define LOG_LEVEL(v) ((v) & 0x07) #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ #define LOG_ALIGN __alignof__(unsigned long) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) #define LOG_BUF_LEN_MAX (u32)(1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; /* * Define the average message size. This only affects the number of * descriptors that will be available. Underestimating is better than * overestimating (too many available descriptors is better than not enough). */ #define PRB_AVGBITS 5 /* 32 character average length */ #if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS #error CONFIG_LOG_BUF_SHIFT value too small. #endif _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, PRB_AVGBITS, &__log_buf[0]); static struct printk_ringbuffer printk_rb_dynamic; struct printk_ringbuffer *prb = &printk_rb_static; /* * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before * per_cpu_areas are initialised. This variable is set to true when * it's safe to access per-CPU data. */ static bool __printk_percpu_data_ready __ro_after_init; bool printk_percpu_data_ready(void) { return __printk_percpu_data_ready; } /* Must be called under syslog_lock. */ static void latched_seq_write(struct latched_seq *ls, u64 val) { write_seqcount_latch_begin(&ls->latch); ls->val[0] = val; write_seqcount_latch(&ls->latch); ls->val[1] = val; write_seqcount_latch_end(&ls->latch); } /* Can be called from any context. */ static u64 latched_seq_read_nolock(struct latched_seq *ls) { unsigned int seq; unsigned int idx; u64 val; do { seq = read_seqcount_latch(&ls->latch); idx = seq & 0x1; val = ls->val[idx]; } while (read_seqcount_latch_retry(&ls->latch, seq)); return val; } /* Return log buffer address */ char *log_buf_addr_get(void) { return log_buf; } /* Return log buffer size */ u32 log_buf_len_get(void) { return log_buf_len; } /* * Define how much of the log buffer we could take at maximum. The value * must be greater than two. Note that only half of the buffer is available * when the index points to the middle. */ #define MAX_LOG_TAKE_PART 4 static const char trunc_msg[] = "<truncated>"; static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) { /* * The message should not take the whole buffer. Otherwise, it might * get removed too soon. */ u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; if (*text_len > max_text_len) *text_len = max_text_len; /* enable the warning message (if there is room) */ *trunc_msg_len = strlen(trunc_msg); if (*text_len >= *trunc_msg_len) *text_len -= *trunc_msg_len; else *trunc_msg_len = 0; } int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) { if (dmesg_restrict) return 1; /* * Unless restricted, we allow "read all" and "get buffer size" * for everybody. */ return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; } static int check_syslog_permissions(int type, int source) { /* * If this is from /proc/kmsg and we've already opened it, then we've * already done the capabilities checks at open time. */ if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN) goto ok; if (syslog_action_restricted(type)) { if (capable(CAP_SYSLOG)) goto ok; return -EPERM; } ok: return security_syslog(type); } static void append_char(char **pp, char *e, char c) { if (*pp < e) *(*pp)++ = c; } static ssize_t info_print_ext_header(char *buf, size_t size, struct printk_info *info) { u64 ts_usec = info->ts_nsec; char caller[20]; #ifdef CONFIG_PRINTK_CALLER u32 id = info->caller_id; snprintf(caller, sizeof(caller), ",caller=%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); #else caller[0] = '\0'; #endif do_div(ts_usec, 1000); return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", (info->facility << 3) | info->level, info->seq, ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller); } static ssize_t msg_add_ext_text(char *buf, size_t size, const char *text, size_t text_len, unsigned char endc) { char *p = buf, *e = buf + size; size_t i; /* escape non-printable characters */ for (i = 0; i < text_len; i++) { unsigned char c = text[i]; if (c < ' ' || c >= 127 || c == '\\') p += scnprintf(p, e - p, "\\x%02x", c); else append_char(&p, e, c); } append_char(&p, e, endc); return p - buf; } static ssize_t msg_add_dict_text(char *buf, size_t size, const char *key, const char *val) { size_t val_len = strlen(val); ssize_t len; if (!val_len) return 0; len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */ len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '='); len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n'); return len; } static ssize_t msg_print_ext_body(char *buf, size_t size, char *text, size_t text_len, struct dev_printk_info *dev_info) { ssize_t len; len = msg_add_ext_text(buf, size, text, text_len, '\n'); if (!dev_info) goto out; len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM", dev_info->subsystem); len += msg_add_dict_text(buf + len, size - len, "DEVICE", dev_info->device); out: return len; } /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { atomic64_t seq; struct ratelimit_state rs; struct mutex lock; struct printk_buffers pbufs; }; static __printf(3, 4) __cold int devkmsg_emit(int facility, int level, const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk_emit(facility, level, NULL, fmt, args); va_end(args); return r; } static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ struct file *file = iocb->ki_filp; struct devkmsg_user *user = file->private_data; size_t len = iov_iter_count(from); ssize_t ret = len; if (len > PRINTKRB_RECORD_MAX) return -EINVAL; /* Ignore when user logging is disabled. */ if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) return len; /* Ratelimit when not explicitly enabled. */ if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) { if (!___ratelimit(&user->rs, current->comm)) return ret; } buf = kmalloc(len+1, GFP_KERNEL); if (buf == NULL) return -ENOMEM; buf[len] = '\0'; if (!copy_from_iter_full(buf, len, from)) { kfree(buf); return -EFAULT; } /* * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace * the decimal value represents 32bit, the lower 3 bit are the log * level, the rest are the log facility. * * If no prefix or no userspace facility is specified, we * enforce LOG_USER, to be able to reliably distinguish * kernel-generated messages from userspace-injected ones. */ line = buf; if (line[0] == '<') { char *endp = NULL; unsigned int u; u = simple_strtoul(line + 1, &endp, 10); if (endp && endp[0] == '>') { level = LOG_LEVEL(u); if (LOG_FACILITY(u) != 0) facility = LOG_FACILITY(u); endp++; line = endp; } } devkmsg_emit(facility, level, "%s", line); kfree(buf); return ret; } static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct devkmsg_user *user = file->private_data; char *outbuf = &user->pbufs.outbuf[0]; struct printk_message pmsg = { .pbufs = &user->pbufs, }; ssize_t ret; ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; goto out; } /* * Guarantee this task is visible on the waitqueue before * checking the wake condition. * * The full memory barrier within set_current_state() of * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * * This pairs with __wake_up_klogd:A. */ ret = wait_event_interruptible(log_wait, printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)); /* LMM(devkmsg_read:A) */ if (ret) goto out; } if (pmsg.dropped) { /* our last seen message is gone, return error and reset */ atomic64_set(&user->seq, pmsg.seq); ret = -EPIPE; goto out; } atomic64_set(&user->seq, pmsg.seq + 1); if (pmsg.outbuf_len > count) { ret = -EINVAL; goto out; } if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) { ret = -EFAULT; goto out; } ret = pmsg.outbuf_len; out: mutex_unlock(&user->lock); return ret; } /* * Be careful when modifying this function!!! * * Only few operations are supported because the device works only with the * entire variable length messages (records). Non-standard values are * returned in the other cases and has been this way for quite some time. * User space applications might depend on this behavior. */ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) { struct devkmsg_user *user = file->private_data; loff_t ret = 0; if (offset) return -ESPIPE; switch (whence) { case SEEK_SET: /* the first record */ atomic64_set(&user->seq, prb_first_valid_seq(prb)); break; case SEEK_DATA: /* * The first record after the last SYSLOG_ACTION_CLEAR, * like issued by 'dmesg -c'. Reading /dev/kmsg itself * changes no global state, and does not clear anything. */ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq)); break; case SEEK_END: /* after the last record */ atomic64_set(&user->seq, prb_next_seq(prb)); break; default: ret = -EINVAL; } return ret; } static __poll_t devkmsg_poll(struct file *file, poll_table *wait) { struct devkmsg_user *user = file->private_data; struct printk_info info; __poll_t ret = 0; poll_wait(file, &log_wait, wait); if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) { /* return error when data has vanished underneath us */ if (info.seq != atomic64_read(&user->seq)) ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else ret = EPOLLIN|EPOLLRDNORM; } return ret; } static int devkmsg_open(struct inode *inode, struct file *file) { struct devkmsg_user *user; int err; if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) return -EPERM; /* write-only does not need any file context */ if ((file->f_flags & O_ACCMODE) != O_WRONLY) { err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, SYSLOG_FROM_READER); if (err) return err; } user = kvmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); if (!user) return -ENOMEM; ratelimit_default_init(&user->rs); ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE); mutex_init(&user->lock); atomic64_set(&user->seq, prb_first_valid_seq(prb)); file->private_data = user; return 0; } static int devkmsg_release(struct inode *inode, struct file *file) { struct devkmsg_user *user = file->private_data; ratelimit_state_exit(&user->rs); mutex_destroy(&user->lock); kvfree(user); return 0; } const struct file_operations kmsg_fops = { .open = devkmsg_open, .read = devkmsg_read, .write_iter = devkmsg_write, .llseek = devkmsg_llseek, .poll = devkmsg_poll, .release = devkmsg_release, }; #ifdef CONFIG_VMCORE_INFO /* * This appends the listed symbols to /proc/vmcore * * /proc/vmcore is used by various utilities, like crash and makedumpfile to * obtain access to symbols that are otherwise very difficult to locate. These * symbols are specifically used so that utilities can access and extract the * dmesg log from a vmcore file after a crash. */ void log_buf_vmcoreinfo_setup(void) { struct dev_printk_info *dev_info = NULL; VMCOREINFO_SYMBOL(prb); VMCOREINFO_SYMBOL(printk_rb_static); VMCOREINFO_SYMBOL(clear_seq); /* * Export struct size and field offsets. User space tools can * parse it and detect any changes to structure down the line. */ VMCOREINFO_STRUCT_SIZE(printk_ringbuffer); VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring); VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring); VMCOREINFO_OFFSET(printk_ringbuffer, fail); VMCOREINFO_STRUCT_SIZE(prb_desc_ring); VMCOREINFO_OFFSET(prb_desc_ring, count_bits); VMCOREINFO_OFFSET(prb_desc_ring, descs); VMCOREINFO_OFFSET(prb_desc_ring, infos); VMCOREINFO_OFFSET(prb_desc_ring, head_id); VMCOREINFO_OFFSET(prb_desc_ring, tail_id); VMCOREINFO_STRUCT_SIZE(prb_desc); VMCOREINFO_OFFSET(prb_desc, state_var); VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos); VMCOREINFO_OFFSET(prb_data_blk_lpos, begin); VMCOREINFO_OFFSET(prb_data_blk_lpos, next); VMCOREINFO_STRUCT_SIZE(printk_info); VMCOREINFO_OFFSET(printk_info, seq); VMCOREINFO_OFFSET(printk_info, ts_nsec); VMCOREINFO_OFFSET(printk_info, text_len); VMCOREINFO_OFFSET(printk_info, caller_id); VMCOREINFO_OFFSET(printk_info, dev_info); VMCOREINFO_STRUCT_SIZE(dev_printk_info); VMCOREINFO_OFFSET(dev_printk_info, subsystem); VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem)); VMCOREINFO_OFFSET(dev_printk_info, device); VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device)); VMCOREINFO_STRUCT_SIZE(prb_data_ring); VMCOREINFO_OFFSET(prb_data_ring, size_bits); VMCOREINFO_OFFSET(prb_data_ring, data); VMCOREINFO_OFFSET(prb_data_ring, head_lpos); VMCOREINFO_OFFSET(prb_data_ring, tail_lpos); VMCOREINFO_SIZE(atomic_long_t); VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); VMCOREINFO_STRUCT_SIZE(latched_seq); VMCOREINFO_OFFSET(latched_seq, val); } #endif /* requested log_buf_len from kernel cmdline */ static unsigned long __initdata new_log_buf_len; /* we practice scaling the ring buffer by powers of 2 */ static void __init log_buf_len_update(u64 size) { if (size > (u64)LOG_BUF_LEN_MAX) { size = (u64)LOG_BUF_LEN_MAX; pr_err("log_buf over 2G is not supported.\n"); } if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) new_log_buf_len = (unsigned long)size; } /* save requested log_buf_len since it's too early to process it */ static int __init log_buf_len_setup(char *str) { u64 size; if (!str) return -EINVAL; size = memparse(str, &str); log_buf_len_update(size); return 0; } early_param("log_buf_len", log_buf_len_setup); #ifdef CONFIG_SMP #define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static void __init log_buf_add_cpu(void) { unsigned int cpu_extra; /* * archs should set up cpu_possible_bits properly with * set_cpu_possible() after setup_arch() but just in * case lets ensure this is valid. */ if (num_possible_cpus() == 1) return; cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; /* by default this will only continue through for large > 64 CPUs */ if (cpu_extra <= __LOG_BUF_LEN / 2) return; pr_info("log_buf_len individual max cpu contribution: %d bytes\n", __LOG_CPU_MAX_BUF_LEN); pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", cpu_extra); pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); log_buf_len_update(cpu_extra + __LOG_BUF_LEN); } #else /* !CONFIG_SMP */ static inline void log_buf_add_cpu(void) {} #endif /* CONFIG_SMP */ static void __init set_percpu_data_ready(void) { __printk_percpu_data_ready = true; } static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, struct printk_record *r) { struct prb_reserved_entry e; struct printk_record dest_r; prb_rec_init_wr(&dest_r, r->info->text_len); if (!prb_reserve(&e, rb, &dest_r)) return 0; memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len); dest_r.info->text_len = r->info->text_len; dest_r.info->facility = r->info->facility; dest_r.info->level = r->info->level; dest_r.info->flags = r->info->flags; dest_r.info->ts_nsec = r->info->ts_nsec; dest_r.info->caller_id = r->info->caller_id; memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info)); prb_final_commit(&e); return prb_record_text_space(&e); } static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata; static void print_log_buf_usage_stats(void) { unsigned int descs_count = log_buf_len >> PRB_AVGBITS; size_t meta_data_size; meta_data_size = descs_count * (sizeof(struct prb_desc) + sizeof(struct printk_info)); pr_info("log buffer data + meta data: %u + %zu = %zu bytes\n", log_buf_len, meta_data_size, log_buf_len + meta_data_size); } void __init setup_log_buf(int early) { struct printk_info *new_infos; unsigned int new_descs_count; struct prb_desc *new_descs; struct printk_info info; struct printk_record r; unsigned int text_size; size_t new_descs_size; size_t new_infos_size; unsigned long flags; char *new_log_buf; unsigned int free; u64 seq; /* * Some archs call setup_log_buf() multiple times - first is very * early, e.g. from setup_arch(), and second - when percpu_areas * are initialised. */ if (!early) set_percpu_data_ready(); if (log_buf != __log_buf) return; if (!early && !new_log_buf_len) log_buf_add_cpu(); if (!new_log_buf_len) { /* Show the memory stats only once. */ if (!early) goto out; return; } new_descs_count = new_log_buf_len >> PRB_AVGBITS; if (new_descs_count == 0) { pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len); goto out; } new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); if (unlikely(!new_log_buf)) { pr_err("log_buf_len: %lu text bytes not available\n", new_log_buf_len); goto out; } new_descs_size = new_descs_count * sizeof(struct prb_desc); new_descs = memblock_alloc(new_descs_size, LOG_ALIGN); if (unlikely(!new_descs)) { pr_err("log_buf_len: %zu desc bytes not available\n", new_descs_size); goto err_free_log_buf; } new_infos_size = new_descs_count * sizeof(struct printk_info); new_infos = memblock_alloc(new_infos_size, LOG_ALIGN); if (unlikely(!new_infos)) { pr_err("log_buf_len: %zu info bytes not available\n", new_infos_size); goto err_free_descs; } prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf)); prb_init(&printk_rb_dynamic, new_log_buf, ilog2(new_log_buf_len), new_descs, ilog2(new_descs_count), new_infos); local_irq_save(flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; free = __LOG_BUF_LEN; prb_for_each_record(0, &printk_rb_static, seq, &r) { text_size = add_to_rb(&printk_rb_dynamic, &r); if (text_size > free) free = 0; else free -= text_size; } prb = &printk_rb_dynamic; local_irq_restore(flags); /* * Copy any remaining messages that might have appeared from * NMI context after copying but before switching to the * dynamic buffer. */ prb_for_each_record(seq, &printk_rb_static, seq, &r) { text_size = add_to_rb(&printk_rb_dynamic, &r); if (text_size > free) free = 0; else free -= text_size; } if (seq != prb_next_seq(&printk_rb_static)) { pr_err("dropped %llu messages\n", prb_next_seq(&printk_rb_static) - seq); } print_log_buf_usage_stats(); pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); return; err_free_descs: memblock_free(new_descs, new_descs_size); err_free_log_buf: memblock_free(new_log_buf, new_log_buf_len); out: print_log_buf_usage_stats(); } static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { ignore_loglevel = true; pr_info("debug: ignoring loglevel setting.\n"); return 0; } early_param("ignore_loglevel", ignore_loglevel_setup); module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting (prints all kernel messages to the console)"); static bool suppress_message_printing(int level) { return (level >= console_loglevel && !ignore_loglevel); } #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) { unsigned long lpj; lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */ loops_per_msec = (unsigned long long)lpj / 1000 * HZ; get_option(&str, &boot_delay); if (boot_delay > 10 * 1000) boot_delay = 0; pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " "HZ: %d, loops_per_msec: %llu\n", boot_delay, preset_lpj, lpj, HZ, loops_per_msec); return 0; } early_param("boot_delay", boot_delay_setup); static void boot_delay_msec(int level) { unsigned long long k; unsigned long timeout; bool suppress = !is_printk_force_console() && suppress_message_printing(level); if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress) return; k = (unsigned long long)loops_per_msec * boot_delay; timeout = jiffies + msecs_to_jiffies(boot_delay); while (k) { k--; cpu_relax(); /* * use (volatile) jiffies to prevent * compiler reduction; loop termination via jiffies * is secondary and may or may not happen. */ if (time_after(jiffies, timeout)) break; touch_nmi_watchdog(); } } #else static inline void boot_delay_msec(int level) { } #endif static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_syslog(unsigned int level, char *buf) { return sprintf(buf, "<%u>", level); } static size_t print_time(u64 ts, char *buf) { unsigned long rem_nsec = do_div(ts, 1000000000); return sprintf(buf, "[%5lu.%06lu]", (unsigned long)ts, rem_nsec / 1000); } #ifdef CONFIG_PRINTK_CALLER static size_t print_caller(u32 id, char *buf) { char caller[12]; snprintf(caller, sizeof(caller), "%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); return sprintf(buf, "[%6s]", caller); } #else #define print_caller(id, buf) 0 #endif static size_t info_print_prefix(const struct printk_info *info, bool syslog, bool time, char *buf) { size_t len = 0; if (syslog) len = print_syslog((info->facility << 3) | info->level, buf); if (time) len += print_time(info->ts_nsec, buf + len); len += print_caller(info->caller_id, buf + len); if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { buf[len++] = ' '; buf[len] = '\0'; } return len; } /* * Prepare the record for printing. The text is shifted within the given * buffer to avoid a need for another one. The following operations are * done: * * - Add prefix for each line. * - Drop truncated lines that no longer fit into the buffer. * - Add the trailing newline that has been removed in vprintk_store(). * - Add a string terminator. * * Since the produced string is always terminated, the maximum possible * return value is @r->text_buf_size - 1; * * Return: The length of the updated/prepared text, including the added * prefixes and the newline. The terminator is not counted. The dropped * line(s) are not counted. */ static size_t record_print_text(struct printk_record *r, bool syslog, bool time) { size_t text_len = r->info->text_len; size_t buf_size = r->text_buf_size; char *text = r->text_buf; char prefix[PRINTK_PREFIX_MAX]; bool truncated = false; size_t prefix_len; size_t line_len; size_t len = 0; char *next; /* * If the message was truncated because the buffer was not large * enough, treat the available text as if it were the full text. */ if (text_len > buf_size) text_len = buf_size; prefix_len = info_print_prefix(r->info, syslog, time, prefix); /* * @text_len: bytes of unprocessed text * @line_len: bytes of current line _without_ newline * @text: pointer to beginning of current line * @len: number of bytes prepared in r->text_buf */ for (;;) { next = memchr(text, '\n', text_len); if (next) { line_len = next - text; } else { /* Drop truncated line(s). */ if (truncated) break; line_len = text_len; } /* * Truncate the text if there is not enough space to add the * prefix and a trailing newline and a terminator. */ if (len + prefix_len + text_len + 1 + 1 > buf_size) { /* Drop even the current line if no space. */ if (len + prefix_len + line_len + 1 + 1 > buf_size) break; text_len = buf_size - len - prefix_len - 1 - 1; truncated = true; } memmove(text + prefix_len, text, text_len); memcpy(text, prefix, prefix_len); /* * Increment the prepared length to include the text and * prefix that were just moved+copied. Also increment for the * newline at the end of this line. If this is the last line, * there is no newline, but it will be added immediately below. */ len += prefix_len + line_len + 1; if (text_len == line_len) { /* * This is the last line. Add the trailing newline * removed in vprintk_store(). */ text[prefix_len + line_len] = '\n'; break; } /* * Advance beyond the added prefix and the related line with * its newline. */ text += prefix_len + line_len + 1; /* * The remaining text has only decreased by the line with its * newline. * * Note that @text_len can become zero. It happens when @text * ended with a newline (either due to truncation or the * original string ending with "\n\n"). The loop is correctly * repeated and (if not truncated) an empty line with a prefix * will be prepared. */ text_len -= line_len + 1; } /* * If a buffer was provided, it will be terminated. Space for the * string terminator is guaranteed to be available. The terminator is * not counted in the return value. */ if (buf_size > 0) r->text_buf[len] = 0; return len; } static size_t get_record_print_text_size(struct printk_info *info, unsigned int line_count, bool syslog, bool time) { char prefix[PRINTK_PREFIX_MAX]; size_t prefix_len; prefix_len = info_print_prefix(info, syslog, time, prefix); /* * Each line will be preceded with a prefix. The intermediate * newlines are already within the text, but a final trailing * newline will be added. */ return ((prefix_len * line_count) + info->text_len + 1); } /* * Beginning with @start_seq, find the first record where it and all following * records up to (but not including) @max_seq fit into @size. * * @max_seq is simply an upper bound and does not need to exist. If the caller * does not require an upper bound, -1 can be used for @max_seq. */ static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size, bool syslog, bool time) { struct printk_info info; unsigned int line_count; size_t len = 0; u64 seq; /* Determine the size of the records up to @max_seq. */ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { if (info.seq >= max_seq) break; len += get_record_print_text_size(&info, line_count, syslog, time); } /* * Adjust the upper bound for the next loop to avoid subtracting * lengths that were never added. */ if (seq < max_seq) max_seq = seq; /* * Move first record forward until length fits into the buffer. Ignore * newest messages that were not counted in the above cycle. Messages * might appear and get lost in the meantime. This is a best effort * that prevents an infinite loop that could occur with a retry. */ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { if (len <= size || info.seq >= max_seq) break; len -= get_record_print_text_size(&info, line_count, syslog, time); } return seq; } /* The caller is responsible for making sure @size is greater than 0. */ static int syslog_print(char __user *buf, int size) { struct printk_info info; struct printk_record r; char *text; int len = 0; u64 seq; text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX); mutex_lock(&syslog_lock); /* * Wait for the @syslog_seq record to be available. @syslog_seq may * change while waiting. */ do { seq = syslog_seq; mutex_unlock(&syslog_lock); /* * Guarantee this task is visible on the waitqueue before * checking the wake condition. * * The full memory barrier within set_current_state() of * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * * This pairs with __wake_up_klogd:A. */ len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */ mutex_lock(&syslog_lock); if (len) goto out; } while (syslog_seq != seq); /* * Copy records that fit into the buffer. The above cycle makes sure * that the first record is always available. */ do { size_t n; size_t skip; int err; if (!prb_read_valid(prb, syslog_seq, &r)) break; if (r.info->seq != syslog_seq) { /* message is gone, move to next valid one */ syslog_seq = r.info->seq; syslog_partial = 0; } /* * To keep reading/counting partial line consistent, * use printk_time value as of the beginning of a line. */ if (!syslog_partial) syslog_time = printk_time; skip = syslog_partial; n = record_print_text(&r, true, syslog_time); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_seq = r.info->seq + 1; n -= syslog_partial; syslog_partial = 0; } else if (!len){ /* partial read(), remember position */ n = size; syslog_partial += n; } else n = 0; if (!n) break; mutex_unlock(&syslog_lock); err = copy_to_user(buf, text + skip, n); mutex_lock(&syslog_lock); if (err) { if (!len) len = -EFAULT; break; } len += n; size -= n; buf += n; } while (size); out: mutex_unlock(&syslog_lock); kfree(text); return len; } static int syslog_print_all(char __user *buf, int size, bool clear) { struct printk_info info; struct printk_record r; char *text; int len = 0; u64 seq; bool time; text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; time = printk_time; /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. */ seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1, size, true, time); prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX); prb_for_each_record(seq, prb, seq, &r) { int textlen; textlen = record_print_text(&r, true, time); if (len + textlen > size) { seq--; break; } if (copy_to_user(buf + len, text, textlen)) len = -EFAULT; else len += textlen; if (len < 0) break; } if (clear) { mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, seq); mutex_unlock(&syslog_lock); } kfree(text); return len; } static void syslog_clear(void) { mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, prb_next_seq(prb)); mutex_unlock(&syslog_lock); } int do_syslog(int type, char __user *buf, int len, int source) { struct printk_info info; bool clear = false; static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; error = check_syslog_permissions(type, source); if (error) return error; switch (type) { case SYSLOG_ACTION_CLOSE: /* Close log */ break; case SYSLOG_ACTION_OPEN: /* Open log */ break; case SYSLOG_ACTION_READ: /* Read from log */ if (!buf || len < 0) return -EINVAL; if (!len) return 0; if (!access_ok(buf, len)) return -EFAULT; error = syslog_print(buf, len); break; /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: clear = true; fallthrough; /* Read last kernel messages */ case SYSLOG_ACTION_READ_ALL: if (!buf || len < 0) return -EINVAL; if (!len) return 0; if (!access_ok(buf, len)) return -EFAULT; error = syslog_print_all(buf, len, clear); break; /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: syslog_clear(); break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == LOGLEVEL_DEFAULT) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; /* Enable logging to console */ case SYSLOG_ACTION_CONSOLE_ON: if (saved_console_loglevel != LOGLEVEL_DEFAULT) { console_loglevel = saved_console_loglevel; saved_console_loglevel = LOGLEVEL_DEFAULT; } break; /* Set level of messages printed to console */ case SYSLOG_ACTION_CONSOLE_LEVEL: if (len < 1 || len > 8) return -EINVAL; if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; /* Implicitly re-enable logging to console */ saved_console_loglevel = LOGLEVEL_DEFAULT; break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: mutex_lock(&syslog_lock); if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { /* No unread messages. */ mutex_unlock(&syslog_lock); return 0; } if (info.seq != syslog_seq) { /* messages are gone, move to first one */ syslog_seq = info.seq; syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { /* * Short-cut for poll(/"proc/kmsg") which simply checks * for pending data, not the size; return the count of * records, not the length. */ error = prb_next_seq(prb) - syslog_seq; } else { bool time = syslog_partial ? syslog_time : printk_time; unsigned int line_count; u64 seq; prb_for_each_info(syslog_seq, prb, seq, &info, &line_count) { error += get_record_print_text_size(&info, line_count, true, time); time = printk_time; } error -= syslog_partial; } mutex_unlock(&syslog_lock); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: error = log_buf_len; break; default: error = -EINVAL; break; } return error; } SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { return do_syslog(type, buf, len, SYSLOG_FROM_READER); } /* * Special console_lock variants that help to reduce the risk of soft-lockups. * They allow to pass console_lock to another printk() call using a busy wait. */ #ifdef CONFIG_LOCKDEP static struct lockdep_map console_owner_dep_map = { .name = "console_owner" }; #endif static DEFINE_RAW_SPINLOCK(console_owner_lock); static struct task_struct *console_owner; static bool console_waiter; /** * console_lock_spinning_enable - mark beginning of code where another * thread might safely busy wait * * This basically converts console_lock into a spinlock. This marks * the section where the console_lock owner can not sleep, because * there may be a waiter spinning (like a spinlock). Also it must be * ready to hand over the lock at the end of the section. */ void console_lock_spinning_enable(void) { /* * Do not use spinning in panic(). The panic CPU wants to keep the lock. * Non-panic CPUs abandon the flush anyway. * * Just keep the lockdep annotation. The panic-CPU should avoid * taking console_owner_lock because it might cause a deadlock. * This looks like the easiest way how to prevent false lockdep * reports without handling races a lockless way. */ if (panic_in_progress()) goto lockdep; raw_spin_lock(&console_owner_lock); console_owner = current; raw_spin_unlock(&console_owner_lock); lockdep: /* The waiter may spin on us after setting console_owner */ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); } /** * console_lock_spinning_disable_and_check - mark end of code where another * thread was able to busy wait and check if there is a waiter * @cookie: cookie returned from console_srcu_read_lock() * * This is called at the end of the section where spinning is allowed. * It has two functions. First, it is a signal that it is no longer * safe to start busy waiting for the lock. Second, it checks if * there is a busy waiter and passes the lock rights to her. * * Important: Callers lose both the console_lock and the SRCU read lock if * there was a busy waiter. They must not touch items synchronized by * console_lock or SRCU read lock in this case. * * Return: 1 if the lock rights were passed, 0 otherwise. */ int console_lock_spinning_disable_and_check(int cookie) { int waiter; /* * Ignore spinning waiters during panic() because they might get stopped * or blocked at any time, * * It is safe because nobody is allowed to start spinning during panic * in the first place. If there has been a waiter then non panic CPUs * might stay spinning. They would get stopped anyway. The panic context * will never start spinning and an interrupted spin on panic CPU will * never continue. */ if (panic_in_progress()) { /* Keep lockdep happy. */ spin_release(&console_owner_dep_map, _THIS_IP_); return 0; } raw_spin_lock(&console_owner_lock); waiter = READ_ONCE(console_waiter); console_owner = NULL; raw_spin_unlock(&console_owner_lock); if (!waiter) { spin_release(&console_owner_dep_map, _THIS_IP_); return 0; } /* The waiter is now free to continue */ WRITE_ONCE(console_waiter, false); spin_release(&console_owner_dep_map, _THIS_IP_); /* * Preserve lockdep lock ordering. Release the SRCU read lock before * releasing the console_lock. */ console_srcu_read_unlock(cookie); /* * Hand off console_lock to waiter. The waiter will perform * the up(). After this, the waiter is the console_lock owner. */ mutex_release(&console_lock_dep_map, _THIS_IP_); return 1; } /** * console_trylock_spinning - try to get console_lock by busy waiting * * This allows to busy wait for the console_lock when the current * owner is running in specially marked sections. It means that * the current owner is running and cannot reschedule until it * is ready to lose the lock. * * Return: 1 if we got the lock, 0 othrewise */ static int console_trylock_spinning(void) { struct task_struct *owner = NULL; bool waiter; bool spin = false; unsigned long flags; if (console_trylock()) return 1; /* * It's unsafe to spin once a panic has begun. If we are the * panic CPU, we may have already halted the owner of the * console_sem. If we are not the panic CPU, then we should * avoid taking console_sem, so the panic CPU has a better * chance of cleanly acquiring it later. */ if (panic_in_progress()) return 0; printk_safe_enter_irqsave(flags); raw_spin_lock(&console_owner_lock); owner = READ_ONCE(console_owner); waiter = READ_ONCE(console_waiter); if (!waiter && owner && owner != current) { WRITE_ONCE(console_waiter, true); spin = true; } raw_spin_unlock(&console_owner_lock); /* * If there is an active printk() writing to the * consoles, instead of having it write our data too, * see if we can offload that load from the active * printer, and do some printing ourselves. * Go into a spin only if there isn't already a waiter * spinning, and there is an active printer, and * that active printer isn't us (recursive printk?). */ if (!spin) { printk_safe_exit_irqrestore(flags); return 0; } /* We spin waiting for the owner to release us */ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); /* Owner will clear console_waiter on hand off */ while (READ_ONCE(console_waiter)) cpu_relax(); spin_release(&console_owner_dep_map, _THIS_IP_); printk_safe_exit_irqrestore(flags); /* * The owner passed the console lock to us. * Since we did not spin on console lock, annotate * this as a trylock. Otherwise lockdep will * complain. */ mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); /* * Update @console_may_schedule for trylock because the previous * owner may have been schedulable. */ console_may_schedule = 0; return 1; } /* * Recursion is tracked separately on each CPU. If NMIs are supported, an * additional NMI context per CPU is also separately tracked. Until per-CPU * is available, a separate "early tracking" is performed. */ static DEFINE_PER_CPU(u8, printk_count); static u8 printk_count_early; #ifdef CONFIG_HAVE_NMI static DEFINE_PER_CPU(u8, printk_count_nmi); static u8 printk_count_nmi_early; #endif /* * Recursion is limited to keep the output sane. printk() should not require * more than 1 level of recursion (allowing, for example, printk() to trigger * a WARN), but a higher value is used in case some printk-internal errors * exist, such as the ringbuffer validation checks failing. */ #define PRINTK_MAX_RECURSION 3 /* * Return a pointer to the dedicated counter for the CPU+context of the * caller. */ static u8 *__printk_recursion_counter(void) { #ifdef CONFIG_HAVE_NMI if (in_nmi()) { if (printk_percpu_data_ready()) return this_cpu_ptr(&printk_count_nmi); return &printk_count_nmi_early; } #endif if (printk_percpu_data_ready()) return this_cpu_ptr(&printk_count); return &printk_count_early; } /* * Enter recursion tracking. Interrupts are disabled to simplify tracking. * The caller must check the boolean return value to see if the recursion is * allowed. On failure, interrupts are not disabled. * * @recursion_ptr must be a variable of type (u8 *) and is the same variable * that is passed to printk_exit_irqrestore(). */ #define printk_enter_irqsave(recursion_ptr, flags) \ ({ \ bool success = true; \ \ typecheck(u8 *, recursion_ptr); \ local_irq_save(flags); \ (recursion_ptr) = __printk_recursion_counter(); \ if (*(recursion_ptr) > PRINTK_MAX_RECURSION) { \ local_irq_restore(flags); \ success = false; \ } else { \ (*(recursion_ptr))++; \ } \ success; \ }) /* Exit recursion tracking, restoring interrupts. */ #define printk_exit_irqrestore(recursion_ptr, flags) \ do { \ typecheck(u8 *, recursion_ptr); \ (*(recursion_ptr))--; \ local_irq_restore(flags); \ } while (0) int printk_delay_msec __read_mostly; static inline void printk_delay(int level) { boot_delay_msec(level); if (unlikely(printk_delay_msec)) { int m = printk_delay_msec; while (m--) { mdelay(1); touch_nmi_watchdog(); } } } static inline u32 printk_caller_id(void) { return in_task() ? task_pid_nr(current) : 0x80000000 + smp_processor_id(); } /** * printk_parse_prefix - Parse level and control flags. * * @text: The terminated text message. * @level: A pointer to the current level value, will be updated. * @flags: A pointer to the current printk_info flags, will be updated. * * @level may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @level must be set to * LOGLEVEL_DEFAULT in order to be updated with the parsed value. * * @flags may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @flags will be OR'd with the parsed * value. * * Return: The length of the parsed level and control flags. */ u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags) { u16 prefix_len = 0; int kern_level; while (*text) { kern_level = printk_get_level(text); if (!kern_level) break; switch (kern_level) { case '0' ... '7': if (level && *level == LOGLEVEL_DEFAULT) *level = kern_level - '0'; break; case 'c': /* KERN_CONT */ if (flags) *flags |= LOG_CONT; } prefix_len += 2; text += 2; } return prefix_len; } __printf(5, 0) static u16 printk_sprint(char *text, u16 size, int facility, enum printk_info_flags *flags, const char *fmt, va_list args) { u16 text_len; text_len = vscnprintf(text, size, fmt, args); /* Mark and strip a trailing newline. */ if (text_len && text[text_len - 1] == '\n') { text_len--; *flags |= LOG_NEWLINE; } /* Strip log level and control flags. */ if (facility == 0) { u16 prefix_len; prefix_len = printk_parse_prefix(text, NULL, NULL); if (prefix_len) { text_len -= prefix_len; memmove(text, text + prefix_len, text_len); } } trace_console(text, text_len); return text_len; } __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { struct prb_reserved_entry e; enum printk_info_flags flags = 0; struct printk_record r; unsigned long irqflags; u16 trunc_msg_len = 0; char prefix_buf[8]; u8 *recursion_ptr; u16 reserve_size; va_list args2; u32 caller_id; u16 text_len; int ret = 0; u64 ts_nsec; if (!printk_enter_irqsave(recursion_ptr, irqflags)) return 0; /* * Since the duration of printk() can vary depending on the message * and state of the ringbuffer, grab the timestamp now so that it is * close to the call of printk(). This provides a more deterministic * timestamp with respect to the caller. */ ts_nsec = local_clock(); caller_id = printk_caller_id(); /* * The sprintf needs to come first since the syslog prefix might be * passed in as a parameter. An extra byte must be reserved so that * later the vscnprintf() into the reserved buffer has room for the * terminating '\0', which is not counted by vsnprintf(). */ va_copy(args2, args); reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1; va_end(args2); if (reserve_size > PRINTKRB_RECORD_MAX) reserve_size = PRINTKRB_RECORD_MAX; /* Extract log level or control flags. */ if (facility == 0) printk_parse_prefix(&prefix_buf[0], &level, &flags); if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; if (dev_info) flags |= LOG_NEWLINE; if (is_printk_force_console()) flags |= LOG_FORCE_CON; if (flags & LOG_CONT) { prb_rec_init_wr(&r, reserve_size); if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) { text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, facility, &flags, fmt, args); r.info->text_len += text_len; if (flags & LOG_FORCE_CON) r.info->flags |= LOG_FORCE_CON; if (flags & LOG_NEWLINE) { r.info->flags |= LOG_NEWLINE; prb_final_commit(&e); } else { prb_commit(&e); } ret = text_len; goto out; } } /* * Explicitly initialize the record before every prb_reserve() call. * prb_reserve_in_last() and prb_reserve() purposely invalidate the * structure when they fail. */ prb_rec_init_wr(&r, reserve_size); if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ truncate_msg(&reserve_size, &trunc_msg_len); prb_rec_init_wr(&r, reserve_size + trunc_msg_len); if (!prb_reserve(&e, prb, &r)) goto out; } /* fill message */ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args); if (trunc_msg_len) memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); r.info->text_len = text_len + trunc_msg_len; r.info->facility = facility; r.info->level = level & 7; r.info->flags = flags & 0x1f; r.info->ts_nsec = ts_nsec; r.info->caller_id = caller_id; if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); /* A message without a trailing newline can be continued. */ if (!(flags & LOG_NEWLINE)) prb_commit(&e); else prb_final_commit(&e); ret = text_len + trunc_msg_len; out: printk_exit_irqrestore(recursion_ptr, irqflags); return ret; } /* * This acts as a one-way switch to allow legacy consoles to print from * the printk() caller context on a panic CPU. It also attempts to flush * the legacy consoles in this context. */ void printk_legacy_allow_panic_sync(void) { struct console_flush_type ft; legacy_allow_panic_sync = true; printk_get_console_flush_type(&ft); if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } } asmlinkage int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { struct console_flush_type ft; int printed_len; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) return 0; /* * The messages on the panic CPU are the most important. If * non-panic CPUs are generating any messages, they will be * silently dropped. */ if (other_cpu_in_panic() && !panic_triggering_all_cpu_backtrace) return 0; printk_get_console_flush_type(&ft); /* If called from the scheduler, we can not call up(). */ if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; ft.legacy_offload |= ft.legacy_direct; ft.legacy_direct = false; } printk_delay(level); printed_len = vprintk_store(facility, level, dev_info, fmt, args); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_direct) { /* * The caller may be holding system-critical or * timing-sensitive locks. Disable preemption during * printing of all remaining records to all consoles so that * this context can return as soon as possible. Hopefully * another printk() caller will take over the printing. */ preempt_disable(); /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers. With the * spinning variant, this context tries to take over the * printing from another printing context. */ if (console_trylock_spinning()) console_unlock(); preempt_enable(); } if (ft.legacy_offload) defer_console_output(); else wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); int vprintk_default(const char *fmt, va_list args) { return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); } EXPORT_SYMBOL_GPL(vprintk_default); asmlinkage __visible int _printk(const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk(fmt, args); va_end(args); return r; } EXPORT_SYMBOL(_printk); static bool pr_flush(int timeout_ms, bool reset_on_progress); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); #else /* CONFIG_PRINTK */ #define printk_time false #define prb_read_valid(rb, seq, r) false #define prb_first_valid_seq(rb) 0 #define prb_next_seq(rb) 0 static u64 syslog_seq; static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } #endif /* CONFIG_PRINTK */ #ifdef CONFIG_EARLY_PRINTK struct console *early_console; asmlinkage __visible void early_printk(const char *fmt, ...) { va_list ap; char buf[512]; int n; if (!early_console) return; va_start(ap, fmt); n = vscnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); early_console->write(early_console, buf, n); } #endif static void set_user_specified(struct console_cmdline *c, bool user_specified) { if (!user_specified) return; /* * @c console was defined by the user on the command line. * Do not clear when added twice also by SPCR or the device tree. */ c->user_specified = true; /* At least one console defined by the user on the command line. */ console_set_on_cmdline = 1; } static int __add_preferred_console(const char *name, const short idx, const char *devname, char *options, char *brl_options, bool user_specified) { struct console_cmdline *c; int i; if (!name && !devname) return -EINVAL; /* * We use a signed short index for struct console for device drivers to * indicate a not yet assigned index or port. However, a negative index * value is not valid when the console name and index are defined on * the command line. */ if (name && idx < 0) return -EINVAL; /* * See if this tty is not yet registered, and * if we have a slot free. */ for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { if ((name && strcmp(c->name, name) == 0 && c->index == idx) || (devname && strcmp(c->devname, devname) == 0)) { if (!brl_options) preferred_console = i; set_user_specified(c, user_specified); return 0; } } if (i == MAX_CMDLINECONSOLES) return -E2BIG; if (!brl_options) preferred_console = i; if (name) strscpy(c->name, name); if (devname) strscpy(c->devname, devname); c->options = options; set_user_specified(c, user_specified); braille_set_options(c, brl_options); c->index = idx; return 0; } static int __init console_msg_format_setup(char *str) { if (!strcmp(str, "syslog")) console_msg_format = MSG_FORMAT_SYSLOG; if (!strcmp(str, "default")) console_msg_format = MSG_FORMAT_DEFAULT; return 1; } __setup("console_msg_format=", console_msg_format_setup); /* * Set up a console. Called via do_early_param() in init/main.c * for each "console=" parameter in the boot command line. */ static int __init console_setup(char *str) { static_assert(sizeof(console_cmdline[0].devname) >= sizeof(console_cmdline[0].name) + 4); char buf[sizeof(console_cmdline[0].devname)]; char *brl_options = NULL; char *ttyname = NULL; char *devname = NULL; char *options; char *s; int idx; /* * console="" or console=null have been suggested as a way to * disable console output. Use ttynull that has been created * for exactly this purpose. */ if (str[0] == 0 || strcmp(str, "null") == 0) { __add_preferred_console("ttynull", 0, NULL, NULL, NULL, true); return 1; } if (_braille_console_setup(&str, &brl_options)) return 1; /* For a DEVNAME:0.0 style console the character device is unknown early */ if (strchr(str, ':')) devname = buf; else ttyname = buf; /* * Decode str into name, index, options. */ if (ttyname && isdigit(str[0])) scnprintf(buf, sizeof(buf), "ttyS%s", str); else strscpy(buf, str); options = strchr(str, ','); if (options) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) strscpy(buf, "ttyS0"); if (!strcmp(str, "ttyb")) strscpy(buf, "ttyS1"); #endif for (s = buf; *s; s++) if ((ttyname && isdigit(*s)) || *s == ',') break; /* @idx will get defined when devname matches. */ if (devname) idx = -1; else idx = simple_strtoul(s, NULL, 10); *s = 0; __add_preferred_console(ttyname, idx, devname, options, brl_options, true); return 1; } __setup("console=", console_setup); /** * add_preferred_console - add a device to the list of preferred consoles. * @name: device name * @idx: device index * @options: options for this console * * The last preferred console added will be used for kernel messages * and stdin/out/err for init. Normally this is used by console_setup * above to handle user-supplied console arguments; however it can also * be used by arch-specific code either to override the user or more * commonly to provide a default console (ie from PROM variables) when * the user has not supplied one. */ int add_preferred_console(const char *name, const short idx, char *options) { return __add_preferred_console(name, idx, NULL, options, NULL, false); } /** * match_devname_and_update_preferred_console - Update a preferred console * when matching devname is found. * @devname: DEVNAME:0.0 style device name * @name: Name of the corresponding console driver, e.g. "ttyS" * @idx: Console index, e.g. port number. * * The function checks whether a device with the given @devname is * preferred via the console=DEVNAME:0.0 command line option. * It fills the missing console driver name and console index * so that a later register_console() call could find (match) * and enable this device. * * It might be used when a driver subsystem initializes particular * devices with already known DEVNAME:0.0 style names. And it * could predict which console driver name and index this device * would later get associated with. * * Return: 0 on success, negative error code on failure. */ int match_devname_and_update_preferred_console(const char *devname, const char *name, const short idx) { struct console_cmdline *c = console_cmdline; int i; if (!devname || !strlen(devname) || !name || !strlen(name) || idx < 0) return -EINVAL; for (i = 0; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { if (!strcmp(devname, c->devname)) { pr_info("associate the preferred console \"%s\" with \"%s%d\"\n", devname, name, idx); strscpy(c->name, name); c->index = idx; return 0; } } return -ENOENT; } EXPORT_SYMBOL_GPL(match_devname_and_update_preferred_console); bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) { console_suspend_enabled = false; return 1; } __setup("no_console_suspend", console_suspend_disable); module_param_named(console_suspend, console_suspend_enabled, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(console_suspend, "suspend console during suspend" " and hibernate operations"); static bool printk_console_no_auto_verbose; void console_verbose(void) { if (console_loglevel && !printk_console_no_auto_verbose) console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; } EXPORT_SYMBOL_GPL(console_verbose); module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644); MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc"); /** * suspend_console - suspend the console subsystem * * This disables printk() while we go into suspend states */ void suspend_console(void) { struct console *con; if (!console_suspend_enabled) return; pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); pr_flush(1000, true); console_list_lock(); for_each_console(con) console_srcu_write_flags(con, con->flags | CON_SUSPENDED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All printing * contexts must be able to see that they are suspended so that it * is guaranteed that all printing has stopped when this function * completes. */ synchronize_srcu(&console_srcu); } void resume_console(void) { struct console_flush_type ft; struct console *con; if (!console_suspend_enabled) return; console_list_lock(); for_each_console(con) console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All printing * contexts must be able to see they are no longer suspended so * that they are guaranteed to wake up and resume printing. */ synchronize_srcu(&console_srcu); printk_get_console_flush_type(&ft); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_offload) defer_console_output(); pr_flush(1000, true); } /** * console_cpu_notify - print deferred console messages after CPU hotplug * @cpu: unused * * If printk() is called from a CPU that is not online yet, the messages * will be printed on the console only if there are CON_ANYTIME consoles. * This function is called when a new CPU comes online (or fails to come * up) or goes offline. */ static int console_cpu_notify(unsigned int cpu) { struct console_flush_type ft; if (!cpuhp_tasks_frozen) { printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } } return 0; } /** * console_lock - block the console subsystem from printing * * Acquires a lock which guarantees that no consoles will * be in or enter their write() callback. * * Can sleep, returns nothing. */ void console_lock(void) { might_sleep(); /* On panic, the console_lock must be left to the panic cpu. */ while (other_cpu_in_panic()) msleep(1000); down_console_sem(); console_locked = 1; console_may_schedule = 1; } EXPORT_SYMBOL(console_lock); /** * console_trylock - try to block the console subsystem from printing * * Try to acquire a lock which guarantees that no consoles will * be in or enter their write() callback. * * returns 1 on success, and 0 on failure to acquire the lock. */ int console_trylock(void) { /* On panic, the console_lock must be left to the panic cpu. */ if (other_cpu_in_panic()) return 0; if (down_trylock_console_sem()) return 0; console_locked = 1; console_may_schedule = 0; return 1; } EXPORT_SYMBOL(console_trylock); int is_console_locked(void) { return console_locked; } EXPORT_SYMBOL(is_console_locked); static void __console_unlock(void) { console_locked = 0; up_console_sem(); } #ifdef CONFIG_PRINTK /* * Prepend the message in @pmsg->pbufs->outbuf. This is achieved by shifting * the existing message over and inserting the scratchbuf message. * * @pmsg is the original printk message. * @fmt is the printf format of the message which will prepend the existing one. * * If there is not enough space in @pmsg->pbufs->outbuf, the existing * message text will be sufficiently truncated. * * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated. */ __printf(2, 3) static void console_prepend_message(struct printk_message *pmsg, const char *fmt, ...) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); char *scratchbuf = &pbufs->scratchbuf[0]; char *outbuf = &pbufs->outbuf[0]; va_list args; size_t len; va_start(args, fmt); len = vscnprintf(scratchbuf, scratchbuf_sz, fmt, args); va_end(args); /* * Make sure outbuf is sufficiently large before prepending. * Keep at least the prefix when the message must be truncated. * It is a rather theoretical problem when someone tries to * use a minimalist buffer. */ if (WARN_ON_ONCE(len + PRINTK_PREFIX_MAX >= outbuf_sz)) return; if (pmsg->outbuf_len + len >= outbuf_sz) { /* Truncate the message, but keep it terminated. */ pmsg->outbuf_len = outbuf_sz - (len + 1); outbuf[pmsg->outbuf_len] = 0; } memmove(outbuf + len, outbuf, pmsg->outbuf_len + 1); memcpy(outbuf, scratchbuf, len); pmsg->outbuf_len += len; } /* * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". * @pmsg->outbuf_len is updated appropriately. * * @pmsg is the printk message to prepend. * * @dropped is the dropped count to report in the dropped message. */ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) { console_prepend_message(pmsg, "** %lu printk messages dropped **\n", dropped); } /* * Prepend the message in @pmsg->pbufs->outbuf with a "replay message". * @pmsg->outbuf_len is updated appropriately. * * @pmsg is the printk message to prepend. */ void console_prepend_replay(struct printk_message *pmsg) { console_prepend_message(pmsg, "** replaying previous printk message **\n"); } /* * Read and format the specified record (or a later record if the specified * record is not available). * * @pmsg will contain the formatted result. @pmsg->pbufs must point to a * struct printk_buffers. * * @seq is the record to read and format. If it is not available, the next * valid record is read. * * @is_extended specifies if the message should be formatted for extended * console output. * * @may_supress specifies if records may be skipped based on loglevel. * * Returns false if no record is available. Otherwise true and all fields * of @pmsg are valid. (See the documentation of struct printk_message * for information about the @pmsg fields.) */ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, bool is_extended, bool may_suppress) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); char *scratchbuf = &pbufs->scratchbuf[0]; char *outbuf = &pbufs->outbuf[0]; struct printk_info info; struct printk_record r; size_t len = 0; bool force_con; /* * Formatting extended messages requires a separate buffer, so use the * scratch buffer to read in the ringbuffer text. * * Formatting normal messages is done in-place, so read the ringbuffer * text directly into the output buffer. */ if (is_extended) prb_rec_init_rd(&r, &info, scratchbuf, scratchbuf_sz); else prb_rec_init_rd(&r, &info, outbuf, outbuf_sz); if (!prb_read_valid(prb, seq, &r)) return false; pmsg->seq = r.info->seq; pmsg->dropped = r.info->seq - seq; force_con = r.info->flags & LOG_FORCE_CON; /* * Skip records that are not forced to be printed on consoles and that * has level above the console loglevel. */ if (!force_con && may_suppress && suppress_message_printing(r.info->level)) goto out; if (is_extended) { len = info_print_ext_header(outbuf, outbuf_sz, r.info); len += msg_print_ext_body(outbuf + len, outbuf_sz - len, &r.text_buf[0], r.info->text_len, &r.info->dev_info); } else { len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); } out: pmsg->outbuf_len = len; return true; } /* * Legacy console printing from printk() caller context does not respect * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a * false positive. For PREEMPT_RT the false positive condition does not * occur. * * This map is used to temporarily establish LD_WAIT_SLEEP context for the * console write() callback when legacy printing to avoid false positive * lockdep complaints, thus allowing lockdep to continue to function for * real issues. */ #ifdef CONFIG_PREEMPT_RT static inline void printk_legacy_allow_spinlock_enter(void) { } static inline void printk_legacy_allow_spinlock_exit(void) { } #else static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP); static inline void printk_legacy_allow_spinlock_enter(void) { lock_map_acquire_try(&printk_legacy_map); } static inline void printk_legacy_allow_spinlock_exit(void) { lock_map_release(&printk_legacy_map); } #endif /* CONFIG_PREEMPT_RT */ /* * Used as the printk buffers for non-panic, serialized console printing. * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles. * Its usage requires the console_lock held. */ struct printk_buffers printk_shared_pbufs; /* * Print one record for the given console. The record printed is whatever * record is the next available record for the given console. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding both the * console_lock and the SRCU read lock. Otherwise it is set to false. * * @cookie is the cookie from the SRCU read lock. * * Returns false if the given console has no next record to print, otherwise * true. * * Requires the console_lock and the SRCU read lock. */ static bool console_emit_next_record(struct console *con, bool *handover, int cookie) { bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED; char *outbuf = &printk_shared_pbufs.outbuf[0]; struct printk_message pmsg = { .pbufs = &printk_shared_pbufs, }; unsigned long flags; *handover = false; if (!printk_get_next_message(&pmsg, con->seq, is_extended, true)) return false; con->dropped += pmsg.dropped; /* Skip messages of formatted length 0. */ if (pmsg.outbuf_len == 0) { con->seq = pmsg.seq + 1; goto skip; } if (con->dropped && !is_extended) { console_prepend_dropped(&pmsg, con->dropped); con->dropped = 0; } /* Write everything out to the hardware. */ if (force_legacy_kthread() && !panic_in_progress()) { /* * With forced threading this function is in a task context * (either legacy kthread or get_init_console_seq()). There * is no need for concern about printk reentrance, handovers, * or lockdep complaints. */ con->write(con, outbuf, pmsg.outbuf_len); con->seq = pmsg.seq + 1; } else { /* * While actively printing out messages, if another printk() * were to occur on another CPU, it may wait for this one to * finish. This task can not be preempted if there is a * waiter waiting to take over. * * Interrupts are disabled because the hand over to a waiter * must not be interrupted until the hand over is completed * (@console_waiter is cleared). */ printk_safe_enter_irqsave(flags); console_lock_spinning_enable(); /* Do not trace print latency. */ stop_critical_timings(); printk_legacy_allow_spinlock_enter(); con->write(con, outbuf, pmsg.outbuf_len); printk_legacy_allow_spinlock_exit(); start_critical_timings(); con->seq = pmsg.seq + 1; *handover = console_lock_spinning_disable_and_check(cookie); printk_safe_exit_irqrestore(flags); } skip: return true; } #else static bool console_emit_next_record(struct console *con, bool *handover, int cookie) { *handover = false; return false; } static inline void printk_kthreads_check_locked(void) { } #endif /* CONFIG_PRINTK */ /* * Print out all remaining records to all consoles. * * @do_cond_resched is set by the caller. It can be true only in schedulable * context. * * @next_seq is set to the sequence number after the last available record. * The value is valid only when this function returns true. It means that all * usable consoles are completely flushed. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding the * console_lock. Otherwise it is set to false. * * Returns true when there was at least one usable console and all messages * were flushed to all usable consoles. A returned false informs the caller * that everything was not flushed (either there were no usable consoles or * another context has taken over printing or it is a panic situation and this * is not the panic CPU). Regardless the reason, the caller should assume it * is not useful to immediately try again. * * Requires the console_lock. */ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) { struct console_flush_type ft; bool any_usable = false; struct console *con; bool any_progress; int cookie; *next_seq = 0; *handover = false; do { any_progress = false; printk_get_console_flush_type(&ft); cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); u64 printk_seq; bool progress; /* * console_flush_all() is only responsible for nbcon * consoles when the nbcon consoles cannot print via * their atomic or threaded flushing. */ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) continue; if (!console_is_usable(con, flags, !do_cond_resched)) continue; any_usable = true; if (flags & CON_NBCON) { progress = nbcon_legacy_emit_next_record(con, handover, cookie, !do_cond_resched); printk_seq = nbcon_seq_read(con); } else { progress = console_emit_next_record(con, handover, cookie); printk_seq = con->seq; } /* * If a handover has occurred, the SRCU read lock * is already released. */ if (*handover) return false; /* Track the next of the highest seq flushed. */ if (printk_seq > *next_seq) *next_seq = printk_seq; if (!progress) continue; any_progress = true; /* Allow panic_cpu to take over the consoles safely. */ if (other_cpu_in_panic()) goto abandon; if (do_cond_resched) cond_resched(); } console_srcu_read_unlock(cookie); } while (any_progress); return any_usable; abandon: console_srcu_read_unlock(cookie); return false; } static void __console_flush_and_unlock(void) { bool do_cond_resched; bool handover; bool flushed; u64 next_seq; /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may * end up dumping a lot of lines, for example, if called from * console registration path, and should invoke cond_resched() * between lines if allowable. Not doing so can cause a very long * scheduling stall on a slow console leading to RCU stall and * softlockup warnings which exacerbate the issue with more * messages practically incapacitating the system. Therefore, create * a local to use for the printing loop. */ do_cond_resched = console_may_schedule; do { console_may_schedule = 0; flushed = console_flush_all(do_cond_resched, &next_seq, &handover); if (!handover) __console_unlock(); /* * Abort if there was a failure to flush all messages to all * usable consoles. Either it is not possible to flush (in * which case it would be an infinite loop of retrying) or * another context has taken over printing. */ if (!flushed) break; /* * Some context may have added new records after * console_flush_all() but before unlocking the console. * Re-check if there is a new record to flush. If the trylock * fails, another context is already handling the printing. */ } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); } /** * console_unlock - unblock the legacy console subsystem from printing * * Releases the console_lock which the caller holds to block printing of * the legacy console subsystem. * * While the console_lock was held, console output may have been buffered * by printk(). If this is the case, console_unlock() emits the output on * legacy consoles prior to releasing the lock. * * console_unlock(); may be called from any context. */ void console_unlock(void) { struct console_flush_type ft; printk_get_console_flush_type(&ft); if (ft.legacy_direct) __console_flush_and_unlock(); else __console_unlock(); } EXPORT_SYMBOL(console_unlock); /** * console_conditional_schedule - yield the CPU if required * * If the console code is currently allowed to sleep, and * if this CPU should yield the CPU to another task, do * so here. * * Must be called within console_lock();. */ void __sched console_conditional_schedule(void) { if (console_may_schedule) cond_resched(); } EXPORT_SYMBOL(console_conditional_schedule); void console_unblank(void) { bool found_unblank = false; struct console *c; int cookie; /* * First check if there are any consoles implementing the unblank() * callback. If not, there is no reason to continue and take the * console lock, which in particular can be dangerous if * @oops_in_progress is set. */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) { found_unblank = true; break; } } console_srcu_read_unlock(cookie); if (!found_unblank) return; /* * Stop console printing because the unblank() callback may * assume the console is not within its write() callback. * * If @oops_in_progress is set, this may be an atomic context. * In that case, attempt a trylock as best-effort. */ if (oops_in_progress) { /* Semaphores are not NMI-safe. */ if (in_nmi()) return; /* * Attempting to trylock the console lock can deadlock * if another CPU was stopped while modifying the * semaphore. "Hope and pray" that this is not the * current situation. */ if (down_trylock_console_sem() != 0) return; } else console_lock(); console_locked = 1; console_may_schedule = 0; cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) c->unblank(); } console_srcu_read_unlock(cookie); console_unlock(); if (!oops_in_progress) pr_flush(1000, true); } /* * Rewind all consoles to the oldest available record. * * IMPORTANT: The function is safe only when called under * console_lock(). It is not enforced because * it is used as a best effort in panic(). */ static void __console_rewind_all(void) { struct console *c; short flags; int cookie; u64 seq; seq = prb_first_valid_seq(prb); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { flags = console_srcu_read_flags(c); if (flags & CON_NBCON) { nbcon_seq_force(c, seq); } else { /* * This assignment is safe only when called under * console_lock(). On panic, legacy consoles are * only best effort. */ c->seq = seq; } } console_srcu_read_unlock(cookie); } /** * console_flush_on_panic - flush console content on panic * @mode: flush all messages in buffer or just the pending ones * * Immediately output all pending messages no matter what. */ void console_flush_on_panic(enum con_flush_mode mode) { struct console_flush_type ft; bool handover; u64 next_seq; /* * Ignore the console lock and flush out the messages. Attempting a * trylock would not be useful because: * * - if it is contended, it must be ignored anyway * - console_lock() and console_trylock() block and fail * respectively in panic for non-panic CPUs * - semaphores are not NMI-safe */ /* * If another context is holding the console lock, * @console_may_schedule might be set. Clear it so that * this context does not call cond_resched() while flushing. */ console_may_schedule = 0; if (mode == CONSOLE_REPLAY_ALL) __console_rewind_all(); printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); /* Flush legacy consoles once allowed, even when dangerous. */ if (legacy_allow_panic_sync) console_flush_all(false, &next_seq, &handover); } /* * Return the console tty driver structure and its associated index */ struct tty_driver *console_device(int *index) { struct console *c; struct tty_driver *driver = NULL; int cookie; /* * Take console_lock to serialize device() callback with * other console operations. For example, fg_console is * modified under console_lock when switching vt. */ console_lock(); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if (!c->device) continue; driver = c->device(c, index); if (driver) break; } console_srcu_read_unlock(cookie); console_unlock(); return driver; } /* * Prevent further output on the passed console device so that (for example) * serial drivers can disable console output before suspending a port, and can * re-enable output afterwards. */ void console_stop(struct console *console) { __pr_flush(console, 1000, true); console_list_lock(); console_srcu_write_flags(console, console->flags & ~CON_ENABLED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All contexts must * be able to see that this console is disabled so that (for example) * the caller can suspend the port without risk of another context * using the port. */ synchronize_srcu(&console_srcu); } EXPORT_SYMBOL(console_stop); void console_start(struct console *console) { struct console_flush_type ft; bool is_nbcon; console_list_lock(); console_srcu_write_flags(console, console->flags | CON_ENABLED); is_nbcon = console->flags & CON_NBCON; console_list_unlock(); /* * Ensure that all SRCU list walks have completed. The related * printing context must be able to see it is enabled so that * it is guaranteed to wake up and resume printing. */ synchronize_srcu(&console_srcu); printk_get_console_flush_type(&ft); if (is_nbcon && ft.nbcon_offload) nbcon_kthread_wake(console); else if (ft.legacy_offload) defer_console_output(); __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); #ifdef CONFIG_PRINTK static int unregister_console_locked(struct console *console); /* True when system boot is far enough to create printer threads. */ static bool printk_kthreads_ready __ro_after_init; static struct task_struct *printk_legacy_kthread; static bool legacy_kthread_should_wakeup(void) { struct console_flush_type ft; struct console *con; bool ret = false; int cookie; if (kthread_should_stop()) return true; printk_get_console_flush_type(&ft); cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); u64 printk_seq; /* * The legacy printer thread is only responsible for nbcon * consoles when the nbcon consoles cannot print via their * atomic or threaded flushing. */ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) continue; if (!console_is_usable(con, flags, false)) continue; if (flags & CON_NBCON) { printk_seq = nbcon_seq_read(con); } else { /* * It is safe to read @seq because only this * thread context updates @seq. */ printk_seq = con->seq; } if (prb_read_valid(prb, printk_seq, NULL)) { ret = true; break; } } console_srcu_read_unlock(cookie); return ret; } static int legacy_kthread_func(void *unused) { for (;;) { wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup()); if (kthread_should_stop()) break; console_lock(); __console_flush_and_unlock(); } return 0; } static bool legacy_kthread_create(void) { struct task_struct *kt; lockdep_assert_console_list_lock_held(); kt = kthread_run(legacy_kthread_func, NULL, "pr/legacy"); if (WARN_ON(IS_ERR(kt))) { pr_err("failed to start legacy printing thread\n"); return false; } printk_legacy_kthread = kt; /* * It is important that console printing threads are scheduled * shortly after a printk call and with generous runtime budgets. */ sched_set_normal(printk_legacy_kthread, -20); return true; } /** * printk_kthreads_shutdown - shutdown all threaded printers * * On system shutdown all threaded printers are stopped. This allows printk * to transition back to atomic printing, thus providing a robust mechanism * for the final shutdown/reboot messages to be output. */ static void printk_kthreads_shutdown(void) { struct console *con; console_list_lock(); if (printk_kthreads_running) { printk_kthreads_running = false; for_each_console(con) { if (con->flags & CON_NBCON) nbcon_kthread_stop(con); } /* * The threads may have been stopped while printing a * backlog. Flush any records left over. */ nbcon_atomic_flush_pending(); } console_list_unlock(); } static struct syscore_ops printk_syscore_ops = { .shutdown = printk_kthreads_shutdown, }; /* * If appropriate, start nbcon kthreads and set @printk_kthreads_running. * If any kthreads fail to start, those consoles are unregistered. * * Must be called under console_list_lock(). */ static void printk_kthreads_check_locked(void) { struct hlist_node *tmp; struct console *con; lockdep_assert_console_list_lock_held(); if (!printk_kthreads_ready) return; if (have_legacy_console || have_boot_console) { if (!printk_legacy_kthread && force_legacy_kthread() && !legacy_kthread_create()) { /* * All legacy consoles must be unregistered. If there * are any nbcon consoles, they will set up their own * kthread. */ hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (con->flags & CON_NBCON) continue; unregister_console_locked(con); } } } else if (printk_legacy_kthread) { kthread_stop(printk_legacy_kthread); printk_legacy_kthread = NULL; } /* * Printer threads cannot be started as long as any boot console is * registered because there is no way to synchronize the hardware * registers between boot console code and regular console code. * It can only be known that there will be no new boot consoles when * an nbcon console is registered. */ if (have_boot_console || !have_nbcon_console) { /* Clear flag in case all nbcon consoles unregistered. */ printk_kthreads_running = false; return; } if (printk_kthreads_running) return; hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (!(con->flags & CON_NBCON)) continue; if (!nbcon_kthread_create(con)) unregister_console_locked(con); } printk_kthreads_running = true; } static int __init printk_set_kthreads_ready(void) { register_syscore_ops(&printk_syscore_ops); console_list_lock(); printk_kthreads_ready = true; printk_kthreads_check_locked(); console_list_unlock(); return 0; } early_initcall(printk_set_kthreads_ready); #endif /* CONFIG_PRINTK */ static int __read_mostly keep_bootcon; static int __init keep_bootcon_setup(char *str) { keep_bootcon = 1; pr_info("debug: skip boot console de-registration.\n"); return 0; } early_param("keep_bootcon", keep_bootcon_setup); static int console_call_setup(struct console *newcon, char *options) { int err; if (!newcon->setup) return 0; /* Synchronize with possible boot console. */ console_lock(); err = newcon->setup(newcon, options); console_unlock(); return err; } /* * This is called by register_console() to try to match * the newly registered console with any of the ones selected * by either the command line or add_preferred_console() and * setup/enable it. * * Care need to be taken with consoles that are statically * enabled such as netconsole */ static int try_enable_preferred_console(struct console *newcon, bool user_specified) { struct console_cmdline *c; int i, err; for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { /* Console not yet initialized? */ if (!c->name[0]) continue; if (c->user_specified != user_specified) continue; if (!newcon->match || newcon->match(newcon, c->name, c->index, c->options) != 0) { /* default matching */ BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); if (strcmp(c->name, newcon->name) != 0) continue; if (newcon->index >= 0 && newcon->index != c->index) continue; if (newcon->index < 0) newcon->index = c->index; if (_braille_register_console(newcon, c)) return 0; err = console_call_setup(newcon, c->options); if (err) return err; } newcon->flags |= CON_ENABLED; if (i == preferred_console) newcon->flags |= CON_CONSDEV; return 0; } /* * Some consoles, such as pstore and netconsole, can be enabled even * without matching. Accept the pre-enabled consoles only when match() * and setup() had a chance to be called. */ if (newcon->flags & CON_ENABLED && c->user_specified == user_specified) return 0; return -ENOENT; } /* Try to enable the console unconditionally */ static void try_enable_default_console(struct console *newcon) { if (newcon->index < 0) newcon->index = 0; if (console_call_setup(newcon, NULL) != 0) return; newcon->flags |= CON_ENABLED; if (newcon->device) newcon->flags |= CON_CONSDEV; } /* Return the starting sequence number for a newly registered console. */ static u64 get_init_console_seq(struct console *newcon, bool bootcon_registered) { struct console *con; bool handover; u64 init_seq; if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) { /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); init_seq = syslog_seq; mutex_unlock(&syslog_lock); } else { /* Begin with next message added to ringbuffer. */ init_seq = prb_next_seq(prb); /* * If any enabled boot consoles are due to be unregistered * shortly, some may not be caught up and may be the same * device as @newcon. Since it is not known which boot console * is the same device, flush all consoles and, if necessary, * start with the message of the enabled boot console that is * the furthest behind. */ if (bootcon_registered && !keep_bootcon) { /* * Hold the console_lock to stop console printing and * guarantee safe access to console->seq. */ console_lock(); /* * Flush all consoles and set the console to start at * the next unprinted sequence number. */ if (!console_flush_all(true, &init_seq, &handover)) { /* * Flushing failed. Just choose the lowest * sequence of the enabled boot consoles. */ /* * If there was a handover, this context no * longer holds the console_lock. */ if (handover) console_lock(); init_seq = prb_next_seq(prb); for_each_console(con) { u64 seq; if (!(con->flags & CON_BOOT) || !(con->flags & CON_ENABLED)) { continue; } if (con->flags & CON_NBCON) seq = nbcon_seq_read(con); else seq = con->seq; if (seq < init_seq) init_seq = seq; } } console_unlock(); } } return init_seq; } #define console_first() \ hlist_entry(console_list.first, struct console, node) static int unregister_console_locked(struct console *console); /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to * print any messages that were printed by the kernel before the * console driver was initialized. * * This can happen pretty early during the boot process (because of * early_printk) - sometimes before setup_arch() completes - be careful * of what kernel features are used - they may not be initialised yet. * * There are two types of consoles - bootconsoles (early_printk) and * "real" consoles (everything which is not a bootconsole) which are * handled differently. * - Any number of bootconsoles can be registered at any time. * - As soon as a "real" console is registered, all bootconsoles * will be unregistered automatically. * - Once a "real" console is registered, any attempt to register a * bootconsoles will be rejected */ void register_console(struct console *newcon) { bool use_device_lock = (newcon->flags & CON_NBCON) && newcon->write_atomic; bool bootcon_registered = false; bool realcon_registered = false; struct console *con; unsigned long flags; u64 init_seq; int err; console_list_lock(); for_each_console(con) { if (WARN(con == newcon, "console '%s%d' already registered\n", con->name, con->index)) { goto unlock; } if (con->flags & CON_BOOT) bootcon_registered = true; else realcon_registered = true; } /* Do not register boot consoles when there already is a real one. */ if ((newcon->flags & CON_BOOT) && realcon_registered) { pr_info("Too late to register bootconsole %s%d\n", newcon->name, newcon->index); goto unlock; } if (newcon->flags & CON_NBCON) { /* * Ensure the nbcon console buffers can be allocated * before modifying any global data. */ if (!nbcon_alloc(newcon)) goto unlock; } /* * See if we want to enable this console driver by default. * * Nope when a console is preferred by the command line, device * tree, or SPCR. * * The first real console with tty binding (driver) wins. More * consoles might get enabled before the right one is found. * * Note that a console with tty binding will have CON_CONSDEV * flag set and will be first in the list. */ if (preferred_console < 0) { if (hlist_empty(&console_list) || !console_first()->device || console_first()->flags & CON_BOOT) { try_enable_default_console(newcon); } } /* See if this console matches one we selected on the command line */ err = try_enable_preferred_console(newcon, true); /* If not, try to match against the platform default(s) */ if (err == -ENOENT) err = try_enable_preferred_console(newcon, false); /* printk() messages are not printed to the Braille console. */ if (err || newcon->flags & CON_BRL) { if (newcon->flags & CON_NBCON) nbcon_free(newcon); goto unlock; } /* * If we have a bootconsole, and are switching to a real console, * don't print everything out again, since when the boot console, and * the real console are the same physical device, it's annoying to * see the beginning boot messages twice */ if (bootcon_registered && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { newcon->flags &= ~CON_PRINTBUFFER; } newcon->dropped = 0; init_seq = get_init_console_seq(newcon, bootcon_registered); if (newcon->flags & CON_NBCON) { have_nbcon_console = true; nbcon_seq_force(newcon, init_seq); } else { have_legacy_console = true; newcon->seq = init_seq; } if (newcon->flags & CON_BOOT) have_boot_console = true; /* * If another context is actively using the hardware of this new * console, it will not be aware of the nbcon synchronization. This * is a risk that two contexts could access the hardware * simultaneously if this new console is used for atomic printing * and the other context is still using the hardware. * * Use the driver synchronization to ensure that the hardware is not * in use while this new console transitions to being registered. */ if (use_device_lock) newcon->device_lock(newcon, &flags); /* * Put this console in the list - keep the * preferred driver at the head of the list. */ if (hlist_empty(&console_list)) { /* Ensure CON_CONSDEV is always set for the head. */ newcon->flags |= CON_CONSDEV; hlist_add_head_rcu(&newcon->node, &console_list); } else if (newcon->flags & CON_CONSDEV) { /* Only the new head can have CON_CONSDEV set. */ console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV); hlist_add_head_rcu(&newcon->node, &console_list); } else { hlist_add_behind_rcu(&newcon->node, console_list.first); } /* * No need to synchronize SRCU here! The caller does not rely * on all contexts being able to see the new console before * register_console() completes. */ /* This new console is now registered. */ if (use_device_lock) newcon->device_unlock(newcon, flags); console_sysfs_notify(); /* * By unregistering the bootconsoles after we enable the real console * we get the "console xxx enabled" message on all the consoles - * boot consoles, real consoles, etc - this is to ensure that end * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ con_printk(KERN_INFO, newcon, "enabled\n"); if (bootcon_registered && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { struct hlist_node *tmp; hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (con->flags & CON_BOOT) unregister_console_locked(con); } } /* Changed console list, may require printer threads to start/stop. */ printk_kthreads_check_locked(); unlock: console_list_unlock(); } EXPORT_SYMBOL(register_console); /* Must be called under console_list_lock(). */ static int unregister_console_locked(struct console *console) { bool use_device_lock = (console->flags & CON_NBCON) && console->write_atomic; bool found_legacy_con = false; bool found_nbcon_con = false; bool found_boot_con = false; unsigned long flags; struct console *c; int res; lockdep_assert_console_list_lock_held(); con_printk(KERN_INFO, console, "disabled\n"); res = _braille_unregister_console(console); if (res < 0) return res; if (res > 0) return 0; if (!console_is_registered_locked(console)) res = -ENODEV; else if (console_is_usable(console, console->flags, true)) __pr_flush(console, 1000, true); /* Disable it unconditionally */ console_srcu_write_flags(console, console->flags & ~CON_ENABLED); if (res < 0) return res; /* * Use the driver synchronization to ensure that the hardware is not * in use while this console transitions to being unregistered. */ if (use_device_lock) console->device_lock(console, &flags); hlist_del_init_rcu(&console->node); if (use_device_lock) console->device_unlock(console, flags); /* * <HISTORICAL> * If this isn't the last console and it has CON_CONSDEV set, we * need to set it on the next preferred console. * </HISTORICAL> * * The above makes no sense as there is no guarantee that the next * console has any device attached. Oh well.... */ if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV) console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV); /* * Ensure that all SRCU list walks have completed. All contexts * must not be able to see this console in the list so that any * exit/cleanup routines can be performed safely. */ synchronize_srcu(&console_srcu); if (console->flags & CON_NBCON) nbcon_free(console); console_sysfs_notify(); if (console->exit) res = console->exit(console); /* * With this console gone, the global flags tracking registered * console types may have changed. Update them. */ for_each_console(c) { if (c->flags & CON_BOOT) found_boot_con = true; if (c->flags & CON_NBCON) found_nbcon_con = true; else found_legacy_con = true; } if (!found_boot_con) have_boot_console = found_boot_con; if (!found_legacy_con) have_legacy_console = found_legacy_con; if (!found_nbcon_con) have_nbcon_console = found_nbcon_con; /* Changed console list, may require printer threads to start/stop. */ printk_kthreads_check_locked(); return res; } int unregister_console(struct console *console) { int res; console_list_lock(); res = unregister_console_locked(console); console_list_unlock(); return res; } EXPORT_SYMBOL(unregister_console); /** * console_force_preferred_locked - force a registered console preferred * @con: The registered console to force preferred. * * Must be called under console_list_lock(). */ void console_force_preferred_locked(struct console *con) { struct console *cur_pref_con; if (!console_is_registered_locked(con)) return; cur_pref_con = console_first(); /* Already preferred? */ if (cur_pref_con == con) return; /* * Delete, but do not re-initialize the entry. This allows the console * to continue to appear registered (via any hlist_unhashed_lockless() * checks), even though it was briefly removed from the console list. */ hlist_del_rcu(&con->node); /* * Ensure that all SRCU list walks have completed so that the console * can be added to the beginning of the console list and its forward * list pointer can be re-initialized. */ synchronize_srcu(&console_srcu); con->flags |= CON_CONSDEV; WARN_ON(!con->device); /* Only the new head can have CON_CONSDEV set. */ console_srcu_write_flags(cur_pref_con, cur_pref_con->flags & ~CON_CONSDEV); hlist_add_head_rcu(&con->node, &console_list); } EXPORT_SYMBOL(console_force_preferred_locked); /* * Initialize the console device. This is called *early*, so * we can't necessarily depend on lots of kernel help here. * Just do some early initializations, and do the complex setup * later. */ void __init console_init(void) { int ret; initcall_t call; initcall_entry_t *ce; /* Setup the default TTY line discipline. */ n_tty_init(); /* * set up the console device so that later boot sequences can * inform about problems etc.. */ ce = __con_initcall_start; trace_initcall_level("console"); while (ce < __con_initcall_end) { call = initcall_from_entry(ce); trace_initcall_start(call); ret = call(); trace_initcall_finish(call, ret); ce++; } } /* * Some boot consoles access data that is in the init section and which will * be discarded after the initcalls have been run. To make sure that no code * will access this data, unregister the boot consoles in a late initcall. * * If for some reason, such as deferred probe or the driver being a loadable * module, the real console hasn't registered yet at this point, there will * be a brief interval in which no messages are logged to the console, which * makes it difficult to diagnose problems that occur during this time. * * To mitigate this problem somewhat, only unregister consoles whose memory * intersects with the init section. Note that all other boot consoles will * get unregistered when the real preferred console is registered. */ static int __init printk_late_init(void) { struct hlist_node *tmp; struct console *con; int ret; console_list_lock(); hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (!(con->flags & CON_BOOT)) continue; /* Check addresses that might be used for enabled consoles. */ if (init_section_intersects(con, sizeof(*con)) || init_section_contains(con->write, 0) || init_section_contains(con->read, 0) || init_section_contains(con->device, 0) || init_section_contains(con->unblank, 0) || init_section_contains(con->data, 0)) { /* * Please, consider moving the reported consoles out * of the init section. */ pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n", con->name, con->index); unregister_console_locked(con); } } console_list_unlock(); ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, console_cpu_notify); WARN_ON(ret < 0); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", console_cpu_notify, NULL); WARN_ON(ret < 0); printk_sysctl_init(); return 0; } late_initcall(printk_late_init); #if defined CONFIG_PRINTK /* If @con is specified, only wait for that console. Otherwise wait for all. */ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms); unsigned long remaining_jiffies = timeout_jiffies; struct console_flush_type ft; struct console *c; u64 last_diff = 0; u64 printk_seq; short flags; int cookie; u64 diff; u64 seq; /* Sorry, pr_flush() will not work this early. */ if (system_state < SYSTEM_SCHEDULING) return false; might_sleep(); seq = prb_next_reserve_seq(prb); /* Flush the consoles so that records up to @seq are printed. */ printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.legacy_direct) { console_lock(); console_unlock(); } for (;;) { unsigned long begin_jiffies; unsigned long slept_jiffies; diff = 0; /* * Hold the console_lock to guarantee safe access to * console->seq. Releasing console_lock flushes more * records in case @seq is still not printed on all * usable consoles. * * Holding the console_lock is not necessary if there * are no legacy or boot consoles. However, such a * console could register at any time. Always hold the * console_lock as a precaution rather than * synchronizing against register_console(). */ console_lock(); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if (con && con != c) continue; flags = console_srcu_read_flags(c); /* * If consoles are not usable, it cannot be expected * that they make forward progress, so only increment * @diff for usable consoles. */ if (!console_is_usable(c, flags, true) && !console_is_usable(c, flags, false)) { continue; } if (flags & CON_NBCON) { printk_seq = nbcon_seq_read(c); } else { printk_seq = c->seq; } if (printk_seq < seq) diff += seq - printk_seq; } console_srcu_read_unlock(cookie); if (diff != last_diff && reset_on_progress) remaining_jiffies = timeout_jiffies; console_unlock(); /* Note: @diff is 0 if there are no usable consoles. */ if (diff == 0 || remaining_jiffies == 0) break; /* msleep(1) might sleep much longer. Check time by jiffies. */ begin_jiffies = jiffies; msleep(1); slept_jiffies = jiffies - begin_jiffies; remaining_jiffies -= min(slept_jiffies, remaining_jiffies); last_diff = diff; } return (diff == 0); } /** * pr_flush() - Wait for printing threads to catch up. * * @timeout_ms: The maximum time (in ms) to wait. * @reset_on_progress: Reset the timeout if forward progress is seen. * * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 * represents infinite waiting. * * If @reset_on_progress is true, the timeout will be reset whenever any * printer has been seen to make some forward progress. * * Context: Process context. May sleep while acquiring console lock. * Return: true if all usable printers are caught up. */ static bool pr_flush(int timeout_ms, bool reset_on_progress) { return __pr_flush(NULL, timeout_ms, reset_on_progress); } /* * Delayed printk version, for scheduler-internal messages: */ #define PRINTK_PENDING_WAKEUP 0x01 #define PRINTK_PENDING_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = this_cpu_xchg(printk_pending, 0); if (pending & PRINTK_PENDING_OUTPUT) { if (force_legacy_kthread()) { if (printk_legacy_kthread) wake_up_interruptible(&legacy_wait); } else { if (console_trylock()) console_unlock(); } } if (pending & PRINTK_PENDING_WAKEUP) wake_up_interruptible(&log_wait); } static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); static void __wake_up_klogd(int val) { if (!printk_percpu_data_ready()) return; preempt_disable(); /* * Guarantee any new records can be seen by tasks preparing to wait * before this context checks if the wait queue is empty. * * The full memory barrier within wq_has_sleeper() pairs with the full * memory barrier within set_current_state() of * prepare_to_wait_event(), which is called after ___wait_event() adds * the waiter but before it has checked the wait condition. * * This pairs with devkmsg_read:A and syslog_print:A. */ if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ (val & PRINTK_PENDING_OUTPUT)) { this_cpu_or(printk_pending, val); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } preempt_enable(); } /** * wake_up_klogd - Wake kernel logging daemon * * Use this function when new records have been added to the ringbuffer * and the console printing of those records has already occurred or is * known to be handled by some other context. This function will only * wake the logging daemon. * * Context: Any context. */ void wake_up_klogd(void) { __wake_up_klogd(PRINTK_PENDING_WAKEUP); } /** * defer_console_output - Wake kernel logging daemon and trigger * console printing in a deferred context * * Use this function when new records have been added to the ringbuffer, * this context is responsible for console printing those records, but * the current context is not allowed to perform the console printing. * Trigger an irq_work context to perform the console printing. This * function also wakes the logging daemon. * * Context: Any context. */ void defer_console_output(void) { /* * New messages may have been added directly to the ringbuffer * using vprintk_store(), so wake any waiters as well. */ __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); } void printk_trigger_flush(void) { defer_console_output(); } int vprintk_deferred(const char *fmt, va_list args) { return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); } int _printk_deferred(const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk_deferred(fmt, args); va_end(args); return r; } /* * printk rate limiting, lifted from the networking subsystem. * * This enforces a rate limit: not more than 10 kernel messages * every 5s to make a denial-of-service attack impossible. */ DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); int __printk_ratelimit(const char *func) { return ___ratelimit(&printk_ratelimit_state, func); } EXPORT_SYMBOL(__printk_ratelimit); /** * printk_timed_ratelimit - caller-controlled printk ratelimiting * @caller_jiffies: pointer to caller's state * @interval_msecs: minimum interval between prints * * printk_timed_ratelimit() returns true if more than @interval_msecs * milliseconds have elapsed since the last time printk_timed_ratelimit() * returned true. */ bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { unsigned long elapsed = jiffies - *caller_jiffies; if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) return false; *caller_jiffies = jiffies; return true; } EXPORT_SYMBOL(printk_timed_ratelimit); static DEFINE_SPINLOCK(dump_list_lock); static LIST_HEAD(dump_list); /** * kmsg_dump_register - register a kernel log dumper. * @dumper: pointer to the kmsg_dumper structure * * Adds a kernel log dumper to the system. The dump callback in the * structure will be called when the kernel oopses or panics and must be * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise. */ int kmsg_dump_register(struct kmsg_dumper *dumper) { unsigned long flags; int err = -EBUSY; /* The dump callback needs to be set */ if (!dumper->dump) return -EINVAL; spin_lock_irqsave(&dump_list_lock, flags); /* Don't allow registering multiple times */ if (!dumper->registered) { dumper->registered = 1; list_add_tail_rcu(&dumper->list, &dump_list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_register); /** * kmsg_dump_unregister - unregister a kmsg dumper. * @dumper: pointer to the kmsg_dumper structure * * Removes a dump device from the system. Returns zero on success and * %-EINVAL otherwise. */ int kmsg_dump_unregister(struct kmsg_dumper *dumper) { unsigned long flags; int err = -EINVAL; spin_lock_irqsave(&dump_list_lock, flags); if (dumper->registered) { dumper->registered = 0; list_del_rcu(&dumper->list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); synchronize_rcu(); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static bool always_kmsg_dump; module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) { switch (reason) { case KMSG_DUMP_PANIC: return "Panic"; case KMSG_DUMP_OOPS: return "Oops"; case KMSG_DUMP_EMERG: return "Emergency"; case KMSG_DUMP_SHUTDOWN: return "Shutdown"; default: return "Unknown"; } } EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); /** * kmsg_dump_desc - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping * @desc: a short string to describe what caused the panic or oops. Can be NULL * if no additional description is available. * * Call each of the registered dumper's dump() callback, which can * retrieve the kmsg records with kmsg_dump_get_line() or * kmsg_dump_get_buffer(). */ void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc) { struct kmsg_dumper *dumper; struct kmsg_dump_detail detail = { .reason = reason, .description = desc}; rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { enum kmsg_dump_reason max_reason = dumper->max_reason; /* * If client has not provided a specific max_reason, default * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set. */ if (max_reason == KMSG_DUMP_UNDEF) { max_reason = always_kmsg_dump ? KMSG_DUMP_MAX : KMSG_DUMP_OOPS; } if (reason > max_reason) continue; /* invoke dumper which will iterate over records */ dumper->dump(dumper, &detail); } rcu_read_unlock(); } /** * kmsg_dump_get_line - retrieve one kmsg log line * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @line: buffer to copy the line to * @size: maximum size of the buffer * @len: length of line placed into buffer * * Start at the beginning of the kmsg buffer, with the oldest kmsg * record, and copy one record into the provided buffer. * * Consecutive calls will return the next available record moving * towards the end of the buffer with the youngest messages. * * A return value of FALSE indicates that there are no more records to * read. */ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, char *line, size_t size, size_t *len) { u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; unsigned int line_count; struct printk_record r; size_t l = 0; bool ret = false; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; prb_rec_init_rd(&r, &info, line, size); /* Read text or count text lines? */ if (line) { if (!prb_read_valid(prb, iter->cur_seq, &r)) goto out; l = record_print_text(&r, syslog, printk_time); } else { if (!prb_read_valid_info(prb, iter->cur_seq, &info, &line_count)) { goto out; } l = get_record_print_text_size(&info, line_count, syslog, printk_time); } iter->cur_seq = r.info->seq + 1; ret = true; out: if (len) *len = l; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_line); /** * kmsg_dump_get_buffer - copy kmsg log lines * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @buf: buffer to copy the line to * @size: maximum size of the buffer * @len_out: length of line placed into buffer * * Start at the end of the kmsg buffer and fill the provided buffer * with as many of the *youngest* kmsg records that fit into it. * If the buffer is large enough, all available kmsg records will be * copied with a single call. * * Consecutive calls will fill the buffer with the next block of * available older records, not including the earlier retrieved ones. * * A return value of FALSE indicates that there are no more records to * read. */ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, char *buf, size_t size, size_t *len_out) { u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; struct printk_record r; u64 seq; u64 next_seq; size_t len = 0; bool ret = false; bool time = printk_time; if (!buf || !size) goto out; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) { if (info.seq != iter->cur_seq) { /* messages are gone, move to first available one */ iter->cur_seq = info.seq; } } /* last entry */ if (iter->cur_seq >= iter->next_seq) goto out; /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. Pass in size-1 * because this function (by way of record_print_text()) will * not write more than size-1 bytes of text into @buf. */ seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq, size - 1, syslog, time); /* * Next kmsg_dump_get_buffer() invocation will dump block of * older records stored right before this one. */ next_seq = seq; prb_rec_init_rd(&r, &info, buf, size); prb_for_each_record(seq, prb, seq, &r) { if (r.info->seq >= iter->next_seq) break; len += record_print_text(&r, syslog, time); /* Adjust record to store to remaining buffer space. */ prb_rec_init_rd(&r, &info, buf + len, size - len); } iter->next_seq = next_seq; ret = true; out: if (len_out) *len_out = len; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); /** * kmsg_dump_rewind - reset the iterator * @iter: kmsg dump iterator * * Reset the dumper's iterator so that kmsg_dump_get_line() and * kmsg_dump_get_buffer() can be called again and used multiple * times within the same dumper.dump() callback. */ void kmsg_dump_rewind(struct kmsg_dump_iter *iter) { iter->cur_seq = latched_seq_read_nolock(&clear_seq); iter->next_seq = prb_next_seq(prb); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); /** * console_try_replay_all - try to replay kernel log on consoles * * Try to obtain lock on console subsystem and replay all * available records in printk buffer on the consoles. * Does nothing if lock is not obtained. * * Context: Any, except for NMI. */ void console_try_replay_all(void) { struct console_flush_type ft; printk_get_console_flush_type(&ft); if (console_trylock()) { __console_rewind_all(); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_offload) defer_console_output(); /* Consoles are flushed as part of console_unlock(). */ console_unlock(); } } #endif #ifdef CONFIG_SMP static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1); static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0); /** * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant * spinning lock is not owned by any CPU. * * Context: Any context. */ void __printk_cpu_sync_wait(void) { do { cpu_relax(); } while (atomic_read(&printk_cpu_sync_owner) != -1); } EXPORT_SYMBOL(__printk_cpu_sync_wait); /** * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant * spinning lock. * * If no processor has the lock, the calling processor takes the lock and * becomes the owner. If the calling processor is already the owner of the * lock, this function succeeds immediately. * * Context: Any context. Expects interrupts to be disabled. * Return: 1 on success, otherwise 0. */ int __printk_cpu_sync_try_get(void) { int cpu; int old; cpu = smp_processor_id(); /* * Guarantee loads and stores from this CPU when it is the lock owner * are _not_ visible to the previous lock owner. This pairs with * __printk_cpu_sync_put:B. * * Memory barrier involvement: * * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, * then __printk_cpu_sync_put:A can never read from * __printk_cpu_sync_try_get:B. * * Relies on: * * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of the previous CPU * matching * ACQUIRE from __printk_cpu_sync_try_get:A to * __printk_cpu_sync_try_get:B of this CPU */ old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1, cpu); /* LMM(__printk_cpu_sync_try_get:A) */ if (old == -1) { /* * This CPU is now the owner and begins loading/storing * data: LMM(__printk_cpu_sync_try_get:B) */ return 1; } else if (old == cpu) { /* This CPU is already the owner. */ atomic_inc(&printk_cpu_sync_nested); return 1; } return 0; } EXPORT_SYMBOL(__printk_cpu_sync_try_get); /** * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock. * * The calling processor must be the owner of the lock. * * Context: Any context. Expects interrupts to be disabled. */ void __printk_cpu_sync_put(void) { if (atomic_read(&printk_cpu_sync_nested)) { atomic_dec(&printk_cpu_sync_nested); return; } /* * This CPU is finished loading/storing data: * LMM(__printk_cpu_sync_put:A) */ /* * Guarantee loads and stores from this CPU when it was the * lock owner are visible to the next lock owner. This pairs * with __printk_cpu_sync_try_get:A. * * Memory barrier involvement: * * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A. * * Relies on: * * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of this CPU * matching * ACQUIRE from __printk_cpu_sync_try_get:A to * __printk_cpu_sync_try_get:B of the next CPU */ atomic_set_release(&printk_cpu_sync_owner, -1); /* LMM(__printk_cpu_sync_put:B) */ } EXPORT_SYMBOL(__printk_cpu_sync_put); #endif /* CONFIG_SMP */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 // SPDX-License-Identifier: GPL-2.0+ /* * ext4_jbd2.h * * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 * * Copyright 1998--1999 Red Hat corp --- All Rights Reserved * * Ext4-specific journaling extensions. */ #ifndef _EXT4_JBD2_H #define _EXT4_JBD2_H #include <linux/fs.h> #include <linux/jbd2.h> #include "ext4.h" #define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal) /* Define the number of blocks we need to account to a transaction to * modify one block of data. * * We may have to touch one inode, one bitmap buffer, up to three * indirection blocks, the group and superblock summaries, and the data * block to complete the transaction. * * For extents-enabled fs we may have to allocate and modify up to * 5 levels of tree, data block (for each of these we need bitmap + group * summaries), root which is stored in the inode, sb */ #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ (ext4_has_feature_extents(sb) ? 20U : 8U) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode * and the superblock, which are already accounted for. */ #define EXT4_XATTR_TRANS_BLOCKS 6U /* Define the minimum size for a transaction which modifies data. This * needs to take into account the fact that we may end up modifying two * quota files too (one for the group, one for the user quota). The * superblock only gets updated once, of course, so don't bother * counting that again for the quota updates. */ #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ EXT4_XATTR_TRANS_BLOCKS - 2 + \ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* * Define the number of metadata blocks we need to account to modify data. * * This include super block, inode block, quota blocks and xattr blocks */ #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* Define an arbitrary limit for the amount of data we will anticipate * writing to any given transaction. For unbounded transactions such as * write(2) and truncate(2) we can write more than this, but we always * start off at the maximum transaction size and grow the transaction * optimistically as we go. */ #define EXT4_MAX_TRANS_DATA 64U /* We break up a large truncate or write transaction once the handle's * buffer credits gets this low, we need either to extend the * transaction or to start a new one. Reserve enough space here for * inode, bitmap, superblock, group and indirection updates for at least * one block, plus two quota updates. Quota allocations are not * needed. */ #define EXT4_RESERVE_TRANS_BLOCKS 12U /* * Number of credits needed if we need to insert an entry into a * directory. For each new index block, we need 4 blocks (old index * block, new index block, bitmap block, bg summary). For normal * htree directories there are 2 levels; if the largedir feature * enabled it's 3 levels. */ #define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U #ifdef CONFIG_QUOTA /* Amount of blocks needed for quota update - we know that the structure was * allocated so we need to update only data block */ #define EXT4_QUOTA_TRANS_BLOCKS(sb) ((ext4_quota_capable(sb)) ? 1 : 0) /* Amount of blocks needed for quota insert/delete - we do some block writes * but inode, sb and group updates are done only once */ #define EXT4_QUOTA_INIT_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\ (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ +3+DQUOT_INIT_REWRITE) : 0) #define EXT4_QUOTA_DEL_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\ (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ +3+DQUOT_DEL_REWRITE) : 0) #else #define EXT4_QUOTA_TRANS_BLOCKS(sb) 0 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 #endif #define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) /* * Ext4 handle operation types -- for logging purposes */ #define EXT4_HT_MISC 0 #define EXT4_HT_INODE 1 #define EXT4_HT_WRITE_PAGE 2 #define EXT4_HT_MAP_BLOCKS 3 #define EXT4_HT_DIR 4 #define EXT4_HT_TRUNCATE 5 #define EXT4_HT_QUOTA 6 #define EXT4_HT_RESIZE 7 #define EXT4_HT_MIGRATE 8 #define EXT4_HT_MOVE_EXTENTS 9 #define EXT4_HT_XATTR 10 #define EXT4_HT_EXT_CONVERT 11 #define EXT4_HT_MAX 12 /** * struct ext4_journal_cb_entry - Base structure for callback information. * * This struct is a 'seed' structure for a using with your own callback * structs. If you are using callbacks you must allocate one of these * or another struct of your own definition which has this struct * as it's first element and pass it to ext4_journal_callback_add(). */ struct ext4_journal_cb_entry { /* list information for other callbacks attached to the same handle */ struct list_head jce_list; /* Function to call with this callback structure */ void (*jce_func)(struct super_block *sb, struct ext4_journal_cb_entry *jce, int error); /* user data goes here */ }; /** * ext4_journal_callback_add: add a function to call after transaction commit * @handle: active journal transaction handle to register callback on * @func: callback function to call after the transaction has committed: * @sb: superblock of current filesystem for transaction * @jce: returned journal callback data * @rc: journal state at commit (0 = transaction committed properly) * @jce: journal callback data (internal and function private data struct) * * The registered function will be called in the context of the journal thread * after the transaction for which the handle was created has completed. * * No locks are held when the callback function is called, so it is safe to * call blocking functions from within the callback, but the callback should * not block or run for too long, or the filesystem will be blocked waiting for * the next transaction to commit. No journaling functions can be used, or * there is a risk of deadlock. * * There is no guaranteed calling order of multiple registered callbacks on * the same transaction. */ static inline void _ext4_journal_callback_add(handle_t *handle, struct ext4_journal_cb_entry *jce) { /* Add the jce to transaction's private list */ list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); } static inline void ext4_journal_callback_add(handle_t *handle, void (*func)(struct super_block *sb, struct ext4_journal_cb_entry *jce, int rc), struct ext4_journal_cb_entry *jce) { struct ext4_sb_info *sbi = EXT4_SB(handle->h_transaction->t_journal->j_private); /* Add the jce to transaction's private list */ jce->jce_func = func; spin_lock(&sbi->s_md_lock); _ext4_journal_callback_add(handle, jce); spin_unlock(&sbi->s_md_lock); } /** * ext4_journal_callback_del: delete a registered callback * @handle: active journal transaction handle on which callback was registered * @jce: registered journal callback entry to unregister * Return true if object was successfully removed */ static inline bool ext4_journal_callback_try_del(handle_t *handle, struct ext4_journal_cb_entry *jce) { bool deleted; struct ext4_sb_info *sbi = EXT4_SB(handle->h_transaction->t_journal->j_private); spin_lock(&sbi->s_md_lock); deleted = !list_empty(&jce->jce_list); list_del_init(&jce->jce_list); spin_unlock(&sbi->s_md_lock); return deleted; } int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc); /* * On success, We end up with an outstanding reference count against * iloc->bh. This _must_ be cleaned up later. */ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc); #define ext4_mark_inode_dirty(__h, __i) \ __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__) int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, const char *func, unsigned int line); int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc); /* * Wrapper functions with which ext4 calls into JBD. */ int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type); int __ext4_forget(const char *where, unsigned int line, handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); int __ext4_journal_get_create_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type); int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct buffer_head *bh); #define ext4_journal_get_write_access(handle, sb, bh, trigger_type) \ __ext4_journal_get_write_access(__func__, __LINE__, (handle), (sb), \ (bh), (trigger_type)) #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ (bh), (block_nr)) #define ext4_journal_get_create_access(handle, sb, bh, trigger_type) \ __ext4_journal_get_create_access(__func__, __LINE__, (handle), (sb), \ (bh), (trigger_type)) #define ext4_handle_dirty_metadata(handle, inode, bh) \ __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ (bh)) handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) /* Note: Do not use this for NULL handles. This is only to determine if * a properly allocated handle is using a journal or not. */ static inline int ext4_handle_valid(handle_t *handle) { if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) return 0; return 1; } static inline void ext4_handle_sync(handle_t *handle) { if (ext4_handle_valid(handle)) handle->h_sync = 1; } static inline int ext4_handle_is_aborted(handle_t *handle) { if (ext4_handle_valid(handle)) return is_handle_aborted(handle); return 0; } static inline int ext4_free_metadata_revoke_credits(struct super_block *sb, int blocks) { /* Freeing each metadata block can result in freeing one cluster */ return blocks * EXT4_SB(sb)->s_cluster_ratio; } static inline int ext4_trans_default_revoke_credits(struct super_block *sb) { return ext4_free_metadata_revoke_credits(sb, 8); } #define ext4_journal_start_sb(sb, type, nblocks) \ __ext4_journal_start_sb(NULL, (sb), __LINE__, (type), (nblocks), 0,\ ext4_trans_default_revoke_credits(sb)) #define ext4_journal_start(inode, type, nblocks) \ __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0, \ ext4_trans_default_revoke_credits((inode)->i_sb)) #define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\ __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\ ext4_trans_default_revoke_credits((inode)->i_sb)) #define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \ __ext4_journal_start((inode), __LINE__, (type), (blocks), 0, \ (revoke_creds)) static inline handle_t *__ext4_journal_start(struct inode *inode, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds) { return __ext4_journal_start_sb(inode, inode->i_sb, line, type, blocks, rsv_blocks, revoke_creds); } #define ext4_journal_stop(handle) \ __ext4_journal_stop(__func__, __LINE__, (handle)) #define ext4_journal_start_reserved(handle, type) \ __ext4_journal_start_reserved((handle), __LINE__, (type)) handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, int type); static inline handle_t *ext4_journal_current_handle(void) { return journal_current_handle(); } static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke) { if (ext4_handle_valid(handle)) return jbd2_journal_extend(handle, nblocks, revoke); return 0; } static inline int ext4_journal_restart(handle_t *handle, int nblocks, int revoke) { if (ext4_handle_valid(handle)) return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS); return 0; } int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, int extend_cred, int revoke_cred); /* * Ensure @handle has at least @check_creds credits available. If not, * transaction will be extended or restarted to contain at least @extend_cred * credits. Before restarting transaction @fn is executed to allow for cleanup * before the transaction is restarted. * * The return value is < 0 in case of error, 0 in case the handle has enough * credits or transaction extension succeeded, 1 in case transaction had to be * restarted. */ #define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred, \ revoke_cred, fn) \ ({ \ __label__ __ensure_end; \ int err = __ext4_journal_ensure_credits((handle), (check_cred), \ (extend_cred), (revoke_cred)); \ \ if (err <= 0) \ goto __ensure_end; \ err = (fn); \ if (err < 0) \ goto __ensure_end; \ err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \ if (err == 0) \ err = 1; \ __ensure_end: \ err; \ }) /* * Ensure given handle has at least requested amount of credits available, * possibly restarting transaction if needed. We also make sure the transaction * has space for at least ext4_trans_default_revoke_credits(sb) revoke records * as freeing one or two blocks is very common pattern and requesting this is * very cheap. */ static inline int ext4_journal_ensure_credits(handle_t *handle, int credits, int revoke_creds) { return ext4_journal_ensure_credits_fn(handle, credits, credits, revoke_creds, 0); } static inline int ext4_journal_blocks_per_page(struct inode *inode) { if (EXT4_JOURNAL(inode) != NULL) return jbd2_journal_blocks_per_page(inode); return 0; } static inline int ext4_journal_force_commit(journal_t *journal) { if (journal) return jbd2_journal_force_commit(journal); return 0; } static inline int ext4_jbd2_inode_add_write(handle_t *handle, struct inode *inode, loff_t start_byte, loff_t length) { if (ext4_handle_valid(handle)) return jbd2_journal_inode_ranged_write(handle, EXT4_I(inode)->jinode, start_byte, length); return 0; } static inline int ext4_jbd2_inode_add_wait(handle_t *handle, struct inode *inode, loff_t start_byte, loff_t length) { if (ext4_handle_valid(handle)) return jbd2_journal_inode_ranged_wait(handle, EXT4_I(inode)->jinode, start_byte, length); return 0; } static inline void ext4_update_inode_fsync_trans(handle_t *handle, struct inode *inode, int datasync) { struct ext4_inode_info *ei = EXT4_I(inode); if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) { ei->i_sync_tid = handle->h_transaction->t_tid; if (datasync) ei->i_datasync_tid = handle->h_transaction->t_tid; } } /* super.c */ int ext4_force_commit(struct super_block *sb); /* * Ext4 inode journal modes */ #define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ #define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ #define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ int ext4_inode_journal_mode(struct inode *inode); static inline int ext4_should_journal_data(struct inode *inode) { return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; } static inline int ext4_should_order_data(struct inode *inode) { return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; } static inline int ext4_should_writeback_data(struct inode *inode) { return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; } static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks) { if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) return 0; if (!ext4_should_journal_data(inode)) return 0; /* * Data blocks in one extent are contiguous, just account for partial * clusters at extent boundaries */ return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1); } /* * This function controls whether or not we should try to go down the * dioread_nolock code paths, which makes it safe to avoid taking * i_rwsem for direct I/O reads. This only works for extent-based * files, and it doesn't work if data journaling is enabled, since the * dioread_nolock code uses b_private to pass information back to the * I/O completion handler, and this conflicts with the jbd's use of * b_private. */ static inline int ext4_should_dioread_nolock(struct inode *inode) { if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) return 0; if (!S_ISREG(inode->i_mode)) return 0; if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return 0; if (ext4_should_journal_data(inode)) return 0; /* temporary fix to prevent generic/422 test failures */ if (!test_opt(inode->i_sb, DELALLOC)) return 0; return 1; } #endif /* _EXT4_JBD2_H */
6 40 88 2028 24 86 1356 1358 1357 46 1346 1358 870 230 137 365 680 34 340 33 136 156 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ADDRCONF_H #define _ADDRCONF_H #define MAX_RTR_SOLICITATIONS -1 /* unlimited */ #define RTR_SOLICITATION_INTERVAL (4*HZ) #define RTR_SOLICITATION_MAX_INTERVAL (3600*HZ) /* 1 hour */ #define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ #define TEMP_VALID_LIFETIME (7*86400) /* 1 week */ #define TEMP_PREFERRED_LIFETIME (86400) /* 24 hours */ #define REGEN_MIN_ADVANCE (2) /* 2 seconds */ #define REGEN_MAX_RETRY (3) #define MAX_DESYNC_FACTOR (600) #define ADDR_CHECK_FREQUENCY (120*HZ) #define IPV6_MAX_ADDRESSES 16 #define ADDRCONF_TIMER_FUZZ_MINUS (HZ > 50 ? HZ / 50 : 1) #define ADDRCONF_TIMER_FUZZ (HZ / 4) #define ADDRCONF_TIMER_FUZZ_MAX (HZ) #define ADDRCONF_NOTIFY_PRIORITY 0 #include <linux/in.h> #include <linux/in6.h> struct prefix_info { __u8 type; __u8 length; __u8 prefix_len; union __packed { __u8 flags; struct __packed { #if defined(__BIG_ENDIAN_BITFIELD) __u8 onlink : 1, autoconf : 1, routeraddr : 1, preferpd : 1, reserved : 4; #elif defined(__LITTLE_ENDIAN_BITFIELD) __u8 reserved : 4, preferpd : 1, routeraddr : 1, autoconf : 1, onlink : 1; #else #error "Please fix <asm/byteorder.h>" #endif }; }; __be32 valid; __be32 prefered; __be32 reserved2; struct in6_addr prefix; }; /* rfc4861 4.6.2: IPv6 PIO is 32 bytes in size */ static_assert(sizeof(struct prefix_info) == 32); #include <linux/ipv6.h> #include <linux/netdevice.h> #include <net/if_inet6.h> #include <net/ipv6.h> struct in6_validator_info { struct in6_addr i6vi_addr; struct inet6_dev *i6vi_dev; struct netlink_ext_ack *extack; }; struct ifa6_config { const struct in6_addr *pfx; unsigned int plen; u8 ifa_proto; const struct in6_addr *peer_pfx; u32 rt_priority; u32 ifa_flags; u32 preferred_lft; u32 valid_lft; u16 scope; }; int addrconf_init(void); void addrconf_cleanup(void); int addrconf_add_ifaddr(struct net *net, void __user *arg); int addrconf_del_ifaddr(struct net *net, void __user *arg); int addrconf_set_dstaddr(struct net *net, void __user *arg); int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict); int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, const struct net_device *dev, bool skip_dev_check, int strict, u32 banned_flags); #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr); #endif int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs, unsigned char nsegs); bool ipv6_chk_custom_prefix(const struct in6_addr *addr, const unsigned int prefix_len, struct net_device *dev); int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev); struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr, struct net_device *dev); struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr, struct net_device *dev, int strict); int ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); bool inet_rcv_saddr_any(const struct sock *sk); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr, u32 flags); int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, const struct prefix_info *pinfo, struct inet6_dev *in6_dev, const struct in6_addr *addr, int addr_type, u32 addr_flags, bool sllao, bool tokenized, __u32 valid_lft, u32 prefered_lft); static inline void addrconf_addr_eui48_base(u8 *eui, const char *const addr) { memcpy(eui, addr, 3); eui[3] = 0xFF; eui[4] = 0xFE; memcpy(eui + 5, addr + 3, 3); } static inline void addrconf_addr_eui48(u8 *eui, const char *const addr) { addrconf_addr_eui48_base(eui, addr); eui[0] ^= 2; } static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) { if (dev->addr_len != ETH_ALEN) return -1; /* * The zSeries OSA network cards can be shared among various * OS instances, but the OSA cards have only one MAC address. * This leads to duplicate address conflicts in conjunction * with IPv6 if more than one instance uses the same card. * * The driver for these cards can deliver a unique 16-bit * identifier for each instance sharing the same card. It is * placed instead of 0xFFFE in the interface identifier. The * "u" bit of the interface identifier is not inverted in this * case. Hence the resulting interface identifier has local * scope according to RFC2373. */ addrconf_addr_eui48_base(eui, dev->dev_addr); if (dev->dev_id) { eui[3] = (dev->dev_id >> 8) & 0xFF; eui[4] = dev->dev_id & 0xFF; } else { eui[0] ^= 2; } return 0; } #define INFINITY_LIFE_TIME 0xFFFFFFFF static inline unsigned long addrconf_timeout_fixup(u32 timeout, unsigned int unit) { if (timeout == INFINITY_LIFE_TIME) return ~0UL; /* * Avoid arithmetic overflow. * Assuming unit is constant and non-zero, this "if" statement * will go away on 64bit archs. */ if (0xfffffffe > LONG_MAX / unit && timeout > LONG_MAX / unit) return LONG_MAX / unit; return timeout; } static inline int addrconf_finite_timeout(unsigned long timeout) { return ~timeout; } /* * IPv6 Address Label subsystem (addrlabel.c) */ int ipv6_addr_label_init(void); void ipv6_addr_label_cleanup(void); int ipv6_addr_label_rtnl_register(void); u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr, int type, int ifindex); /* * multicast prototypes (mcast.c) */ static inline bool ipv6_mc_may_pull(struct sk_buff *skb, unsigned int len) { if (skb_transport_offset(skb) + ipv6_transport_len(skb) < len) return false; return pskb_may_pull(skb, len); } int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); void __ipv6_sock_mc_close(struct sock *sk); void ipv6_sock_mc_close(struct sock *sk); bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr, const struct in6_addr *src_addr); int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr); int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr); int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr); void ipv6_mc_up(struct inet6_dev *idev); void ipv6_mc_down(struct inet6_dev *idev); void ipv6_mc_unmap(struct inet6_dev *idev); void ipv6_mc_remap(struct inet6_dev *idev); void ipv6_mc_init_dev(struct inet6_dev *idev); void ipv6_mc_destroy_dev(struct inet6_dev *idev); int ipv6_mc_check_mld(struct sk_buff *skb); void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp); bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr); void ipv6_mc_dad_complete(struct inet6_dev *idev); /* * identify MLD packets for MLD filter exceptions */ static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset) { struct icmp6hdr *hdr; if (nexthdr != IPPROTO_ICMPV6 || !pskb_network_may_pull(skb, offset + sizeof(struct icmp6hdr))) return false; hdr = (struct icmp6hdr *)(skb_network_header(skb) + offset); switch (hdr->icmp6_type) { case ICMPV6_MGM_QUERY: case ICMPV6_MGM_REPORT: case ICMPV6_MGM_REDUCTION: case ICMPV6_MLD2_REPORT: return true; default: break; } return false; } void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao); /* * anycast prototypes (anycast.c) */ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); void __ipv6_sock_ac_close(struct sock *sk); void ipv6_sock_ac_close(struct sock *sk); int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr); int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr); void ipv6_ac_destroy_dev(struct inet6_dev *idev); bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr); bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev, const struct in6_addr *addr); int ipv6_anycast_init(void); void ipv6_anycast_cleanup(void); /* Device notifier */ int register_inet6addr_notifier(struct notifier_block *nb); int unregister_inet6addr_notifier(struct notifier_block *nb); int inet6addr_notifier_call_chain(unsigned long val, void *v); int register_inet6addr_validator_notifier(struct notifier_block *nb); int unregister_inet6addr_validator_notifier(struct notifier_block *nb); int inet6addr_validator_notifier_call_chain(unsigned long val, void *v); void inet6_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv6_devconf *devconf); /** * __in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device * * Caller must hold rcu_read_lock or RTNL, because this function * does not take a reference on the inet6_dev. */ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) { return rcu_dereference_rtnl(dev->ip6_ptr); } /** * __in6_dev_stats_get - get inet6_dev pointer for stats * @dev: network device * @skb: skb for original incoming interface if needed * * Caller must hold rcu_read_lock or RTNL, because this function * does not take a reference on the inet6_dev. */ static inline struct inet6_dev *__in6_dev_stats_get(const struct net_device *dev, const struct sk_buff *skb) { if (netif_is_l3_master(dev)) dev = dev_get_by_index_rcu(dev_net(dev), inet6_iif(skb)); return __in6_dev_get(dev); } /** * __in6_dev_get_safely - get inet6_dev pointer from netdevice * @dev: network device * * This is a safer version of __in6_dev_get */ static inline struct inet6_dev *__in6_dev_get_safely(const struct net_device *dev) { if (likely(dev)) return rcu_dereference_rtnl(dev->ip6_ptr); else return NULL; } /** * in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device * * This version can be used in any context, and takes a reference * on the inet6_dev. Callers must use in6_dev_put() later to * release this reference. */ static inline struct inet6_dev *in6_dev_get(const struct net_device *dev) { struct inet6_dev *idev; rcu_read_lock(); idev = rcu_dereference(dev->ip6_ptr); if (idev) refcount_inc(&idev->refcnt); rcu_read_unlock(); return idev; } static inline struct neigh_parms *__in6_dev_nd_parms_get_rcu(const struct net_device *dev) { struct inet6_dev *idev = __in6_dev_get(dev); return idev ? idev->nd_parms : NULL; } void in6_dev_finish_destroy(struct inet6_dev *idev); static inline void in6_dev_put(struct inet6_dev *idev) { if (refcount_dec_and_test(&idev->refcnt)) in6_dev_finish_destroy(idev); } static inline void in6_dev_put_clear(struct inet6_dev **pidev) { struct inet6_dev *idev = *pidev; if (idev) { in6_dev_put(idev); *pidev = NULL; } } static inline void __in6_dev_put(struct inet6_dev *idev) { refcount_dec(&idev->refcnt); } static inline void in6_dev_hold(struct inet6_dev *idev) { refcount_inc(&idev->refcnt); } /* called with rcu_read_lock held */ static inline bool ip6_ignore_linkdown(const struct net_device *dev) { const struct inet6_dev *idev = __in6_dev_get(dev); if (unlikely(!idev)) return true; return !!READ_ONCE(idev->cnf.ignore_routes_with_linkdown); } void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp); static inline void in6_ifa_put(struct inet6_ifaddr *ifp) { if (refcount_dec_and_test(&ifp->refcnt)) inet6_ifa_finish_destroy(ifp); } static inline void __in6_ifa_put(struct inet6_ifaddr *ifp) { refcount_dec(&ifp->refcnt); } static inline void in6_ifa_hold(struct inet6_ifaddr *ifp) { refcount_inc(&ifp->refcnt); } static inline bool in6_ifa_hold_safe(struct inet6_ifaddr *ifp) { return refcount_inc_not_zero(&ifp->refcnt); } /* * compute link-local solicited-node multicast address */ static inline void addrconf_addr_solict_mult(const struct in6_addr *addr, struct in6_addr *solicited) { ipv6_addr_set(solicited, htonl(0xFF020000), 0, htonl(0x1), htonl(0xFF000000) | addr->s6_addr32[3]); } static inline bool ipv6_addr_is_ll_all_nodes(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(1))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x00000001))) == 0; #endif } static inline bool ipv6_addr_is_ll_all_routers(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(2))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x00000002))) == 0; #endif } static inline bool ipv6_addr_is_isatap(const struct in6_addr *addr) { return (addr->s6_addr32[2] | htonl(0x02000000)) == htonl(0x02005EFE); } static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | ((p[1] ^ cpu_to_be64(0x00000001ff000000UL)) & cpu_to_be64(0xffffffffff000000UL))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | (addr->s6_addr32[2] ^ htonl(0x00000001)) | (addr->s6_addr[12] ^ 0xff)) == 0; #endif } static inline bool ipv6_addr_is_all_snoopers(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(0x6a))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x0000006a))) == 0; #endif } #ifdef CONFIG_PROC_FS int if6_proc_init(void); void if6_proc_exit(void); #endif #endif
12 34 12 1 20 3 1 2 20 3 1 41 41 41 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_GRO_H #define _NET_GRO_H #include <linux/indirect_call_wrapper.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/ip6_checksum.h> #include <linux/skbuff.h> #include <net/udp.h> #include <net/hotdata.h> struct napi_gro_cb { union { struct { /* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */ void *frag0; /* Length of frag0. */ unsigned int frag0_len; }; struct { /* used in skb_gro_receive() slow path */ struct sk_buff *last; /* jiffies when first packet was created/queued */ unsigned long age; }; }; /* This indicates where we are processing relative to skb->data. */ int data_offset; /* This is non-zero if the packet cannot be merged with the new skb. */ u16 flush; /* Number of segments aggregated. */ u16 count; /* Used in ipv6_gro_receive() and foo-over-udp and esp-in-udp */ u16 proto; u16 pad; /* Used in napi_gro_cb::free */ #define NAPI_GRO_FREE 1 #define NAPI_GRO_FREE_STOLEN_HEAD 2 /* portion of the cb set to zero at every gro iteration */ struct_group(zeroed, /* Start offset for remote checksum offload */ u16 gro_remcsum_start; /* This is non-zero if the packet may be of the same flow. */ u8 same_flow:1; /* Used in tunnel GRO receive */ u8 encap_mark:1; /* GRO checksum is valid */ u8 csum_valid:1; /* Number of checksums via CHECKSUM_UNNECESSARY */ u8 csum_cnt:3; /* Free the skb? */ u8 free:2; /* Used in foo-over-udp, set in udp[46]_gro_receive */ u8 is_ipv6:1; /* Used in GRE, set in fou/gue_gro_receive */ u8 is_fou:1; /* Used to determine if ipid_offset can be ignored */ u8 ip_fixedid:1; /* Number of gro_receive callbacks this packet already went through */ u8 recursion_counter:4; /* GRO is done by frag_list pointer chaining. */ u8 is_flist:1; ); /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; /* L3 offsets */ union { struct { u16 network_offset; u16 inner_network_offset; }; u16 network_offsets[2]; }; }; #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) #define GRO_RECURSION_LIMIT 15 static inline int gro_recursion_inc_test(struct sk_buff *skb) { return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT; } typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *); static inline struct sk_buff *call_gro_receive(gro_receive_t cb, struct list_head *head, struct sk_buff *skb) { if (unlikely(gro_recursion_inc_test(skb))) { NAPI_GRO_CB(skb)->flush |= 1; return NULL; } return cb(head, skb); } typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *, struct sk_buff *); static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb, struct sock *sk, struct list_head *head, struct sk_buff *skb) { if (unlikely(gro_recursion_inc_test(skb))) { NAPI_GRO_CB(skb)->flush |= 1; return NULL; } return cb(sk, head, skb); } static inline unsigned int skb_gro_offset(const struct sk_buff *skb) { return NAPI_GRO_CB(skb)->data_offset; } static inline unsigned int skb_gro_len(const struct sk_buff *skb) { return skb->len - NAPI_GRO_CB(skb)->data_offset; } static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len) { NAPI_GRO_CB(skb)->data_offset += len; } static inline void *skb_gro_header_fast(const struct sk_buff *skb, unsigned int offset) { return NAPI_GRO_CB(skb)->frag0 + offset; } static inline bool skb_gro_may_pull(const struct sk_buff *skb, unsigned int hlen) { return likely(hlen <= NAPI_GRO_CB(skb)->frag0_len); } static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, unsigned int offset) { if (!pskb_may_pull(skb, hlen)) return NULL; return skb->data + offset; } static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen, unsigned int offset) { void *ptr; ptr = skb_gro_header_fast(skb, offset); if (!skb_gro_may_pull(skb, hlen)) ptr = skb_gro_header_slow(skb, hlen, offset); return ptr; } static inline int skb_gro_receive_network_offset(const struct sk_buff *skb) { return NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark]; } static inline void *skb_gro_network_header(const struct sk_buff *skb) { if (skb_gro_may_pull(skb, skb_gro_offset(skb))) return skb_gro_header_fast(skb, skb_gro_receive_network_offset(skb)); return skb->data + skb_gro_receive_network_offset(skb); } static inline __wsum inet_gro_compute_pseudo(const struct sk_buff *skb, int proto) { const struct iphdr *iph = skb_gro_network_header(skb); return csum_tcpudp_nofold(iph->saddr, iph->daddr, skb_gro_len(skb), proto, 0); } static inline void skb_gro_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { if (NAPI_GRO_CB(skb)->csum_valid) NAPI_GRO_CB(skb)->csum = wsum_negate(csum_partial(start, len, wsum_negate(NAPI_GRO_CB(skb)->csum))); } /* GRO checksum functions. These are logical equivalents of the normal * checksum functions (in skbuff.h) except that they operate on the GRO * offsets and fields in sk_buff. */ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb); static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb) { return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb)); } static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, bool zero_okay, __sum16 check) { return ((skb->ip_summed != CHECKSUM_PARTIAL || skb_checksum_start_offset(skb) < skb_gro_offset(skb)) && !skb_at_gro_remcsum_start(skb) && NAPI_GRO_CB(skb)->csum_cnt == 0 && (!zero_okay || check)); } static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb, __wsum psum) { if (NAPI_GRO_CB(skb)->csum_valid && !csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum))) return 0; NAPI_GRO_CB(skb)->csum = psum; return __skb_gro_checksum_complete(skb); } static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) { if (NAPI_GRO_CB(skb)->csum_cnt > 0) { /* Consume a checksum from CHECKSUM_UNNECESSARY */ NAPI_GRO_CB(skb)->csum_cnt--; } else { /* Update skb for CHECKSUM_UNNECESSARY and csum_level when we * verified a new top level checksum or an encapsulated one * during GRO. This saves work if we fallback to normal path. */ __skb_incr_checksum_unnecessary(skb); } } #define __skb_gro_checksum_validate(skb, proto, zero_okay, check, \ compute_pseudo) \ ({ \ __sum16 __ret = 0; \ if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ __ret = __skb_gro_checksum_validate_complete(skb, \ compute_pseudo(skb, proto)); \ if (!__ret) \ skb_gro_incr_csum_unnecessary(skb); \ __ret; \ }) #define skb_gro_checksum_validate(skb, proto, compute_pseudo) \ __skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo) #define skb_gro_checksum_validate_zero_check(skb, proto, check, \ compute_pseudo) \ __skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo) #define skb_gro_checksum_simple_validate(skb) \ __skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo) static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb) { return (NAPI_GRO_CB(skb)->csum_cnt == 0 && !NAPI_GRO_CB(skb)->csum_valid); } static inline void __skb_gro_checksum_convert(struct sk_buff *skb, __wsum pseudo) { NAPI_GRO_CB(skb)->csum = ~pseudo; NAPI_GRO_CB(skb)->csum_valid = 1; } #define skb_gro_checksum_try_convert(skb, proto, compute_pseudo) \ do { \ if (__skb_gro_checksum_convert_check(skb)) \ __skb_gro_checksum_convert(skb, \ compute_pseudo(skb, proto)); \ } while (0) struct gro_remcsum { int offset; __wsum delta; }; static inline void skb_gro_remcsum_init(struct gro_remcsum *grc) { grc->offset = 0; grc->delta = 0; } static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, unsigned int off, size_t hdrlen, int start, int offset, struct gro_remcsum *grc, bool nopartial) { __wsum delta; size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); BUG_ON(!NAPI_GRO_CB(skb)->csum_valid); if (!nopartial) { NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start; return ptr; } ptr = skb_gro_header(skb, off + plen, off); if (!ptr) return NULL; delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum, start, offset); /* Adjust skb->csum since we changed the packet */ NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); grc->offset = off + hdrlen + offset; grc->delta = delta; return ptr; } static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, struct gro_remcsum *grc) { void *ptr; size_t plen = grc->offset + sizeof(u16); if (!grc->delta) return; ptr = skb_gro_header(skb, plen, grc->offset); if (!ptr) return; remcsum_unadjust((__sum16 *)ptr, grc->delta); } #ifdef CONFIG_XFRM_OFFLOAD static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) { if (PTR_ERR(pp) != -EINPROGRESS) NAPI_GRO_CB(skb)->flush |= flush; } static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, struct sk_buff *pp, int flush, struct gro_remcsum *grc) { if (PTR_ERR(pp) != -EINPROGRESS) { NAPI_GRO_CB(skb)->flush |= flush; skb_gro_remcsum_cleanup(skb, grc); skb->remcsum_offload = 0; } } #else static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) { NAPI_GRO_CB(skb)->flush |= flush; } static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, struct sk_buff *pp, int flush, struct gro_remcsum *grc) { NAPI_GRO_CB(skb)->flush |= flush; skb_gro_remcsum_cleanup(skb, grc); skb->remcsum_offload = 0; } #endif INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int)); #define indirect_call_gro_receive_inet(cb, f2, f1, head, skb) \ ({ \ unlikely(gro_recursion_inc_test(skb)) ? \ NAPI_GRO_CB(skb)->flush |= 1, NULL : \ INDIRECT_CALL_INET(cb, f2, f1, head, skb); \ }) struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, struct udphdr *uh, struct sock *sk); int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) { struct udphdr *uh; unsigned int hlen, off; off = skb_gro_offset(skb); hlen = off + sizeof(*uh); uh = skb_gro_header(skb, hlen, off); return uh; } static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb, int proto) { const struct ipv6hdr *iph = skb_gro_network_header(skb); return ~csum_unfold(csum_ipv6_magic(&iph->saddr, &iph->daddr, skb_gro_len(skb), proto, 0)); } static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2, struct sk_buff *p, bool outer) { const u32 id = ntohl(*(__be32 *)&iph->id); const u32 id2 = ntohl(*(__be32 *)&iph2->id); const u16 ipid_offset = (id >> 16) - (id2 >> 16); const u16 count = NAPI_GRO_CB(p)->count; const u32 df = id & IP_DF; int flush; /* All fields must match except length and checksum. */ flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF)); if (flush | (outer && df)) return flush; /* When we receive our second frame we can make a decision on if we * continue this flow as an atomic flow with a fixed ID or if we use * an incrementing ID. */ if (count == 1 && df && !ipid_offset) NAPI_GRO_CB(p)->ip_fixedid = true; return ipid_offset ^ (count * !NAPI_GRO_CB(p)->ip_fixedid); } static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2) { /* <Version:4><Traffic_Class:8><Flow_Label:20> */ __be32 first_word = *(__be32 *)iph ^ *(__be32 *)iph2; /* Flush if Traffic Class fields are different. */ return !!((first_word & htonl(0x0FF00000)) | (__force __be32)(iph->hop_limit ^ iph2->hop_limit)); } static inline int __gro_receive_network_flush(const void *th, const void *th2, struct sk_buff *p, const u16 diff, bool outer) { const void *nh = th - diff; const void *nh2 = th2 - diff; if (((struct iphdr *)nh)->version == 6) return ipv6_gro_flush(nh, nh2); else return inet_gro_flush(nh, nh2, p, outer); } static inline int gro_receive_network_flush(const void *th, const void *th2, struct sk_buff *p) { const bool encap_mark = NAPI_GRO_CB(p)->encap_mark; int off = skb_transport_offset(p); int flush; flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, encap_mark); if (encap_mark) flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, false); return flush; } int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ static inline void gro_normal_list(struct napi_struct *napi) { if (!napi->rx_count) return; netif_receive_skb_list_internal(&napi->rx_list); INIT_LIST_HEAD(&napi->rx_list); napi->rx_count = 0; } /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, * pass the whole batch up to the stack. */ static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs) { list_add_tail(&skb->list, &napi->rx_list); napi->rx_count += segs; if (napi->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch)) gro_normal_list(napi); } /* This function is the alternative of 'inet_iif' and 'inet_sdif' * functions in case we can not rely on fields of IPCB. * * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized. * The caller must hold the RCU read lock. */ static inline void inet_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif) { *iif = inet_iif(skb) ?: skb->dev->ifindex; *sdif = 0; #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (netif_is_l3_slave(skb->dev)) { struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev); *sdif = *iif; *iif = master ? master->ifindex : 0; } #endif } /* This function is the alternative of 'inet6_iif' and 'inet6_sdif' * functions in case we can not rely on fields of IP6CB. * * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized. * The caller must hold the RCU read lock. */ static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif) { /* using skb->dev->ifindex because skb_dst(skb) is not initialized */ *iif = skb->dev->ifindex; *sdif = 0; #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (netif_is_l3_slave(skb->dev)) { struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev); *sdif = *iif; *iif = master ? master->ifindex : 0; } #endif } struct packet_offload *gro_find_receive_by_type(__be16 type); struct packet_offload *gro_find_complete_by_type(__be16 type); #endif /* _NET_GRO_H */
3 3 3 4 4 4 4 6 5 1 4 9 2 2 2 2 2 2 2 4 5 5 4 4 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 // SPDX-License-Identifier: GPL-2.0 /* * BlueZ - Bluetooth protocol stack for Linux * * Copyright (C) 2021 Intel Corporation * Copyright 2023 NXP */ #include <linux/property.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/mgmt.h> #include "hci_codec.h" #include "hci_debugfs.h" #include "smp.h" #include "eir.h" #include "msft.h" #include "aosp.h" #include "leds.h" static void hci_cmd_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, struct sk_buff *skb) { bt_dev_dbg(hdev, "result 0x%2.2x", result); if (hdev->req_status != HCI_REQ_PEND) return; hdev->req_result = result; hdev->req_status = HCI_REQ_DONE; /* Free the request command so it is not used as response */ kfree_skb(hdev->req_skb); hdev->req_skb = NULL; if (skb) { struct sock *sk = hci_skb_sk(skb); /* Drop sk reference if set */ if (sk) sock_put(sk); hdev->req_rsp = skb_get(skb); } wake_up_interruptible(&hdev->req_wait_q); } struct sk_buff *hci_cmd_sync_alloc(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, struct sock *sk) { int len = HCI_COMMAND_HDR_SIZE + plen; struct hci_command_hdr *hdr; struct sk_buff *skb; skb = bt_skb_alloc(len, GFP_ATOMIC); if (!skb) return NULL; hdr = skb_put(skb, HCI_COMMAND_HDR_SIZE); hdr->opcode = cpu_to_le16(opcode); hdr->plen = plen; if (plen) skb_put_data(skb, param, plen); bt_dev_dbg(hdev, "skb len %d", skb->len); hci_skb_pkt_type(skb) = HCI_COMMAND_PKT; hci_skb_opcode(skb) = opcode; /* Grab a reference if command needs to be associated with a sock (e.g. * likely mgmt socket that initiated the command). */ if (sk) { hci_skb_sk(skb) = sk; sock_hold(sk); } return skb; } static void hci_cmd_sync_add(struct hci_request *req, u16 opcode, u32 plen, const void *param, u8 event, struct sock *sk) { struct hci_dev *hdev = req->hdev; struct sk_buff *skb; bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen); /* If an error occurred during request building, there is no point in * queueing the HCI command. We can simply return. */ if (req->err) return; skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, sk); if (!skb) { bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)", opcode); req->err = -ENOMEM; return; } if (skb_queue_empty(&req->cmd_q)) bt_cb(skb)->hci.req_flags |= HCI_REQ_START; hci_skb_event(skb) = event; skb_queue_tail(&req->cmd_q, skb); } static int hci_req_sync_run(struct hci_request *req) { struct hci_dev *hdev = req->hdev; struct sk_buff *skb; unsigned long flags; bt_dev_dbg(hdev, "length %u", skb_queue_len(&req->cmd_q)); /* If an error occurred during request building, remove all HCI * commands queued on the HCI request queue. */ if (req->err) { skb_queue_purge(&req->cmd_q); return req->err; } /* Do not allow empty requests */ if (skb_queue_empty(&req->cmd_q)) return -ENODATA; skb = skb_peek_tail(&req->cmd_q); bt_cb(skb)->hci.req_complete_skb = hci_cmd_sync_complete; bt_cb(skb)->hci.req_flags |= HCI_REQ_SKB; spin_lock_irqsave(&hdev->cmd_q.lock, flags); skb_queue_splice_tail(&req->cmd_q, &hdev->cmd_q); spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); queue_work(hdev->workqueue, &hdev->cmd_work); return 0; } static void hci_request_init(struct hci_request *req, struct hci_dev *hdev) { skb_queue_head_init(&req->cmd_q); req->hdev = hdev; req->err = 0; } /* This function requires the caller holds hdev->req_lock. */ struct sk_buff *__hci_cmd_sync_sk(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u8 event, u32 timeout, struct sock *sk) { struct hci_request req; struct sk_buff *skb; int err = 0; bt_dev_dbg(hdev, "Opcode 0x%4.4x", opcode); hci_request_init(&req, hdev); hci_cmd_sync_add(&req, opcode, plen, param, event, sk); hdev->req_status = HCI_REQ_PEND; err = hci_req_sync_run(&req); if (err < 0) return ERR_PTR(err); err = wait_event_interruptible_timeout(hdev->req_wait_q, hdev->req_status != HCI_REQ_PEND, timeout); if (err == -ERESTARTSYS) return ERR_PTR(-EINTR); switch (hdev->req_status) { case HCI_REQ_DONE: err = -bt_to_errno(hdev->req_result); break; case HCI_REQ_CANCELED: err = -hdev->req_result; break; default: err = -ETIMEDOUT; break; } hdev->req_status = 0; hdev->req_result = 0; skb = hdev->req_rsp; hdev->req_rsp = NULL; bt_dev_dbg(hdev, "end: err %d", err); if (err < 0) { kfree_skb(skb); return ERR_PTR(err); } /* If command return a status event skb will be set to NULL as there are * no parameters. */ if (!skb) return ERR_PTR(-ENODATA); return skb; } EXPORT_SYMBOL(__hci_cmd_sync_sk); /* This function requires the caller holds hdev->req_lock. */ struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout) { return __hci_cmd_sync_sk(hdev, opcode, plen, param, 0, timeout, NULL); } EXPORT_SYMBOL(__hci_cmd_sync); /* Send HCI command and wait for command complete event */ struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout) { struct sk_buff *skb; if (!test_bit(HCI_UP, &hdev->flags)) return ERR_PTR(-ENETDOWN); bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen); hci_req_sync_lock(hdev); skb = __hci_cmd_sync(hdev, opcode, plen, param, timeout); hci_req_sync_unlock(hdev); return skb; } EXPORT_SYMBOL(hci_cmd_sync); /* This function requires the caller holds hdev->req_lock. */ struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u8 event, u32 timeout) { return __hci_cmd_sync_sk(hdev, opcode, plen, param, event, timeout, NULL); } EXPORT_SYMBOL(__hci_cmd_sync_ev); /* This function requires the caller holds hdev->req_lock. */ int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u8 event, u32 timeout, struct sock *sk) { struct sk_buff *skb; u8 status; skb = __hci_cmd_sync_sk(hdev, opcode, plen, param, event, timeout, sk); /* If command return a status event, skb will be set to -ENODATA */ if (skb == ERR_PTR(-ENODATA)) return 0; if (IS_ERR(skb)) { if (!event) bt_dev_err(hdev, "Opcode 0x%4.4x failed: %ld", opcode, PTR_ERR(skb)); return PTR_ERR(skb); } status = skb->data[0]; kfree_skb(skb); return status; } EXPORT_SYMBOL(__hci_cmd_sync_status_sk); int __hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout) { return __hci_cmd_sync_status_sk(hdev, opcode, plen, param, 0, timeout, NULL); } EXPORT_SYMBOL(__hci_cmd_sync_status); int hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout) { int err; hci_req_sync_lock(hdev); err = __hci_cmd_sync_status(hdev, opcode, plen, param, timeout); hci_req_sync_unlock(hdev); return err; } EXPORT_SYMBOL(hci_cmd_sync_status); static void hci_cmd_sync_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_sync_work); bt_dev_dbg(hdev, ""); /* Dequeue all entries and run them */ while (1) { struct hci_cmd_sync_work_entry *entry; mutex_lock(&hdev->cmd_sync_work_lock); entry = list_first_entry_or_null(&hdev->cmd_sync_work_list, struct hci_cmd_sync_work_entry, list); if (entry) list_del(&entry->list); mutex_unlock(&hdev->cmd_sync_work_lock); if (!entry) break; bt_dev_dbg(hdev, "entry %p", entry); if (entry->func) { int err; hci_req_sync_lock(hdev); err = entry->func(hdev, entry->data); if (entry->destroy) entry->destroy(hdev, entry->data, err); hci_req_sync_unlock(hdev); } kfree(entry); } } static void hci_cmd_sync_cancel_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_sync_cancel_work); cancel_delayed_work_sync(&hdev->cmd_timer); cancel_delayed_work_sync(&hdev->ncmd_timer); atomic_set(&hdev->cmd_cnt, 1); wake_up_interruptible(&hdev->req_wait_q); } static int hci_scan_disable_sync(struct hci_dev *hdev); static int scan_disable_sync(struct hci_dev *hdev, void *data) { return hci_scan_disable_sync(hdev); } static int interleaved_inquiry_sync(struct hci_dev *hdev, void *data) { return hci_inquiry_sync(hdev, DISCOV_INTERLEAVED_INQUIRY_LEN, 0); } static void le_scan_disable(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, le_scan_disable.work); int status; bt_dev_dbg(hdev, ""); hci_dev_lock(hdev); if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) goto _return; status = hci_cmd_sync_queue(hdev, scan_disable_sync, NULL, NULL); if (status) { bt_dev_err(hdev, "failed to disable LE scan: %d", status); goto _return; } /* If we were running LE only scan, change discovery state. If * we were running both LE and BR/EDR inquiry simultaneously, * and BR/EDR inquiry is already finished, stop discovery, * otherwise BR/EDR inquiry will stop discovery when finished. * If we will resolve remote device name, do not change * discovery state. */ if (hdev->discovery.type == DISCOV_TYPE_LE) goto discov_stopped; if (hdev->discovery.type != DISCOV_TYPE_INTERLEAVED) goto _return; if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) { if (!test_bit(HCI_INQUIRY, &hdev->flags) && hdev->discovery.state != DISCOVERY_RESOLVING) goto discov_stopped; goto _return; } status = hci_cmd_sync_queue(hdev, interleaved_inquiry_sync, NULL, NULL); if (status) { bt_dev_err(hdev, "inquiry failed: status %d", status); goto discov_stopped; } goto _return; discov_stopped: hci_discovery_set_state(hdev, DISCOVERY_STOPPED); _return: hci_dev_unlock(hdev); } static int hci_le_set_scan_enable_sync(struct hci_dev *hdev, u8 val, u8 filter_dup); static int reenable_adv_sync(struct hci_dev *hdev, void *data) { bt_dev_dbg(hdev, ""); if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) && list_empty(&hdev->adv_instances)) return 0; if (hdev->cur_adv_instance) { return hci_schedule_adv_instance_sync(hdev, hdev->cur_adv_instance, true); } else { if (ext_adv_capable(hdev)) { hci_start_ext_adv_sync(hdev, 0x00); } else { hci_update_adv_data_sync(hdev, 0x00); hci_update_scan_rsp_data_sync(hdev, 0x00); hci_enable_advertising_sync(hdev); } } return 0; } static void reenable_adv(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, reenable_adv_work); int status; bt_dev_dbg(hdev, ""); hci_dev_lock(hdev); status = hci_cmd_sync_queue(hdev, reenable_adv_sync, NULL, NULL); if (status) bt_dev_err(hdev, "failed to reenable ADV: %d", status); hci_dev_unlock(hdev); } static void cancel_adv_timeout(struct hci_dev *hdev) { if (hdev->adv_instance_timeout) { hdev->adv_instance_timeout = 0; cancel_delayed_work(&hdev->adv_instance_expire); } } /* For a single instance: * - force == true: The instance will be removed even when its remaining * lifetime is not zero. * - force == false: the instance will be deactivated but kept stored unless * the remaining lifetime is zero. * * For instance == 0x00: * - force == true: All instances will be removed regardless of their timeout * setting. * - force == false: Only instances that have a timeout will be removed. */ int hci_clear_adv_instance_sync(struct hci_dev *hdev, struct sock *sk, u8 instance, bool force) { struct adv_info *adv_instance, *n, *next_instance = NULL; int err; u8 rem_inst; /* Cancel any timeout concerning the removed instance(s). */ if (!instance || hdev->cur_adv_instance == instance) cancel_adv_timeout(hdev); /* Get the next instance to advertise BEFORE we remove * the current one. This can be the same instance again * if there is only one instance. */ if (instance && hdev->cur_adv_instance == instance) next_instance = hci_get_next_instance(hdev, instance); if (instance == 0x00) { list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { if (!(force || adv_instance->timeout)) continue; rem_inst = adv_instance->instance; err = hci_remove_adv_instance(hdev, rem_inst); if (!err) mgmt_advertising_removed(sk, hdev, rem_inst); } } else { adv_instance = hci_find_adv_instance(hdev, instance); if (force || (adv_instance && adv_instance->timeout && !adv_instance->remaining_time)) { /* Don't advertise a removed instance. */ if (next_instance && next_instance->instance == instance) next_instance = NULL; err = hci_remove_adv_instance(hdev, instance); if (!err) mgmt_advertising_removed(sk, hdev, instance); } } if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_ADVERTISING)) return 0; if (next_instance && !ext_adv_capable(hdev)) return hci_schedule_adv_instance_sync(hdev, next_instance->instance, false); return 0; } static int adv_timeout_expire_sync(struct hci_dev *hdev, void *data) { u8 instance = *(u8 *)data; kfree(data); hci_clear_adv_instance_sync(hdev, NULL, instance, false); if (list_empty(&hdev->adv_instances)) return hci_disable_advertising_sync(hdev); return 0; } static void adv_timeout_expire(struct work_struct *work) { u8 *inst_ptr; struct hci_dev *hdev = container_of(work, struct hci_dev, adv_instance_expire.work); bt_dev_dbg(hdev, ""); hci_dev_lock(hdev); hdev->adv_instance_timeout = 0; if (hdev->cur_adv_instance == 0x00) goto unlock; inst_ptr = kmalloc(1, GFP_KERNEL); if (!inst_ptr) goto unlock; *inst_ptr = hdev->cur_adv_instance; hci_cmd_sync_queue(hdev, adv_timeout_expire_sync, inst_ptr, NULL); unlock: hci_dev_unlock(hdev); } static bool is_interleave_scanning(struct hci_dev *hdev) { return hdev->interleave_scan_state != INTERLEAVE_SCAN_NONE; } static int hci_passive_scan_sync(struct hci_dev *hdev); static void interleave_scan_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, interleave_scan.work); unsigned long timeout; if (hdev->interleave_scan_state == INTERLEAVE_SCAN_ALLOWLIST) { timeout = msecs_to_jiffies(hdev->advmon_allowlist_duration); } else if (hdev->interleave_scan_state == INTERLEAVE_SCAN_NO_FILTER) { timeout = msecs_to_jiffies(hdev->advmon_no_filter_duration); } else { bt_dev_err(hdev, "unexpected error"); return; } hci_passive_scan_sync(hdev); hci_dev_lock(hdev); switch (hdev->interleave_scan_state) { case INTERLEAVE_SCAN_ALLOWLIST: bt_dev_dbg(hdev, "next state: allowlist"); hdev->interleave_scan_state = INTERLEAVE_SCAN_NO_FILTER; break; case INTERLEAVE_SCAN_NO_FILTER: bt_dev_dbg(hdev, "next state: no filter"); hdev->interleave_scan_state = INTERLEAVE_SCAN_ALLOWLIST; break; case INTERLEAVE_SCAN_NONE: bt_dev_err(hdev, "unexpected error"); } hci_dev_unlock(hdev); /* Don't continue interleaving if it was canceled */ if (is_interleave_scanning(hdev)) queue_delayed_work(hdev->req_workqueue, &hdev->interleave_scan, timeout); } void hci_cmd_sync_init(struct hci_dev *hdev) { INIT_WORK(&hdev->cmd_sync_work, hci_cmd_sync_work); INIT_LIST_HEAD(&hdev->cmd_sync_work_list); mutex_init(&hdev->cmd_sync_work_lock); mutex_init(&hdev->unregister_lock); INIT_WORK(&hdev->cmd_sync_cancel_work, hci_cmd_sync_cancel_work); INIT_WORK(&hdev->reenable_adv_work, reenable_adv); INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable); INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire); INIT_DELAYED_WORK(&hdev->interleave_scan, interleave_scan_work); } static void _hci_cmd_sync_cancel_entry(struct hci_dev *hdev, struct hci_cmd_sync_work_entry *entry, int err) { if (entry->destroy) entry->destroy(hdev, entry->data, err); list_del(&entry->list); kfree(entry); } void hci_cmd_sync_clear(struct hci_dev *hdev) { struct hci_cmd_sync_work_entry *entry, *tmp; cancel_work_sync(&hdev->cmd_sync_work); cancel_work_sync(&hdev->reenable_adv_work); mutex_lock(&hdev->cmd_sync_work_lock); list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list) _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); mutex_unlock(&hdev->cmd_sync_work_lock); } void hci_cmd_sync_cancel(struct hci_dev *hdev, int err) { bt_dev_dbg(hdev, "err 0x%2.2x", err); if (hdev->req_status == HCI_REQ_PEND) { hdev->req_result = err; hdev->req_status = HCI_REQ_CANCELED; queue_work(hdev->workqueue, &hdev->cmd_sync_cancel_work); } } EXPORT_SYMBOL(hci_cmd_sync_cancel); /* Cancel ongoing command request synchronously: * * - Set result and mark status to HCI_REQ_CANCELED * - Wakeup command sync thread */ void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err) { bt_dev_dbg(hdev, "err 0x%2.2x", err); if (hdev->req_status == HCI_REQ_PEND) { /* req_result is __u32 so error must be positive to be properly * propagated. */ hdev->req_result = err < 0 ? -err : err; hdev->req_status = HCI_REQ_CANCELED; wake_up_interruptible(&hdev->req_wait_q); } } EXPORT_SYMBOL(hci_cmd_sync_cancel_sync); /* Submit HCI command to be run in as cmd_sync_work: * * - hdev must _not_ be unregistered */ int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { struct hci_cmd_sync_work_entry *entry; int err = 0; mutex_lock(&hdev->unregister_lock); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { err = -ENODEV; goto unlock; } entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { err = -ENOMEM; goto unlock; } entry->func = func; entry->data = data; entry->destroy = destroy; mutex_lock(&hdev->cmd_sync_work_lock); list_add_tail(&entry->list, &hdev->cmd_sync_work_list); mutex_unlock(&hdev->cmd_sync_work_lock); queue_work(hdev->req_workqueue, &hdev->cmd_sync_work); unlock: mutex_unlock(&hdev->unregister_lock); return err; } EXPORT_SYMBOL(hci_cmd_sync_submit); /* Queue HCI command: * * - hdev must be running */ int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { /* Only queue command if hdev is running which means it had been opened * and is either on init phase or is already up. */ if (!test_bit(HCI_RUNNING, &hdev->flags)) return -ENETDOWN; return hci_cmd_sync_submit(hdev, func, data, destroy); } EXPORT_SYMBOL(hci_cmd_sync_queue); static struct hci_cmd_sync_work_entry * _hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { struct hci_cmd_sync_work_entry *entry, *tmp; list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list) { if (func && entry->func != func) continue; if (data && entry->data != data) continue; if (destroy && entry->destroy != destroy) continue; return entry; } return NULL; } /* Queue HCI command entry once: * * - Lookup if an entry already exist and only if it doesn't creates a new entry * and queue it. */ int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { if (hci_cmd_sync_lookup_entry(hdev, func, data, destroy)) return 0; return hci_cmd_sync_queue(hdev, func, data, destroy); } EXPORT_SYMBOL(hci_cmd_sync_queue_once); /* Run HCI command: * * - hdev must be running * - if on cmd_sync_work then run immediately otherwise queue */ int hci_cmd_sync_run(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { /* Only queue command if hdev is running which means it had been opened * and is either on init phase or is already up. */ if (!test_bit(HCI_RUNNING, &hdev->flags)) return -ENETDOWN; /* If on cmd_sync_work then run immediately otherwise queue */ if (current_work() == &hdev->cmd_sync_work) return func(hdev, data); return hci_cmd_sync_submit(hdev, func, data, destroy); } EXPORT_SYMBOL(hci_cmd_sync_run); /* Run HCI command entry once: * * - Lookup if an entry already exist and only if it doesn't creates a new entry * and run it. * - if on cmd_sync_work then run immediately otherwise queue */ int hci_cmd_sync_run_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { if (hci_cmd_sync_lookup_entry(hdev, func, data, destroy)) return 0; return hci_cmd_sync_run(hdev, func, data, destroy); } EXPORT_SYMBOL(hci_cmd_sync_run_once); /* Lookup HCI command entry: * * - Return first entry that matches by function callback or data or * destroy callback. */ struct hci_cmd_sync_work_entry * hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { struct hci_cmd_sync_work_entry *entry; mutex_lock(&hdev->cmd_sync_work_lock); entry = _hci_cmd_sync_lookup_entry(hdev, func, data, destroy); mutex_unlock(&hdev->cmd_sync_work_lock); return entry; } EXPORT_SYMBOL(hci_cmd_sync_lookup_entry); /* Cancel HCI command entry */ void hci_cmd_sync_cancel_entry(struct hci_dev *hdev, struct hci_cmd_sync_work_entry *entry) { mutex_lock(&hdev->cmd_sync_work_lock); _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); mutex_unlock(&hdev->cmd_sync_work_lock); } EXPORT_SYMBOL(hci_cmd_sync_cancel_entry); /* Dequeue one HCI command entry: * * - Lookup and cancel first entry that matches. */ bool hci_cmd_sync_dequeue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { struct hci_cmd_sync_work_entry *entry; entry = hci_cmd_sync_lookup_entry(hdev, func, data, destroy); if (!entry) return false; hci_cmd_sync_cancel_entry(hdev, entry); return true; } EXPORT_SYMBOL(hci_cmd_sync_dequeue_once); /* Dequeue HCI command entry: * * - Lookup and cancel any entry that matches by function callback or data or * destroy callback. */ bool hci_cmd_sync_dequeue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { struct hci_cmd_sync_work_entry *entry; bool ret = false; mutex_lock(&hdev->cmd_sync_work_lock); while ((entry = _hci_cmd_sync_lookup_entry(hdev, func, data, destroy))) { _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); ret = true; } mutex_unlock(&hdev->cmd_sync_work_lock); return ret; } EXPORT_SYMBOL(hci_cmd_sync_dequeue); int hci_update_eir_sync(struct hci_dev *hdev) { struct hci_cp_write_eir cp; bt_dev_dbg(hdev, ""); if (!hdev_is_powered(hdev)) return 0; if (!lmp_ext_inq_capable(hdev)) return 0; if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) return 0; if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE)) return 0; memset(&cp, 0, sizeof(cp)); eir_create(hdev, cp.data); if (memcmp(cp.data, hdev->eir, sizeof(cp.data)) == 0) return 0; memcpy(hdev->eir, cp.data, sizeof(cp.data)); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_EIR, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static u8 get_service_classes(struct hci_dev *hdev) { struct bt_uuid *uuid; u8 val = 0; list_for_each_entry(uuid, &hdev->uuids, list) val |= uuid->svc_hint; return val; } int hci_update_class_sync(struct hci_dev *hdev) { u8 cod[3]; bt_dev_dbg(hdev, ""); if (!hdev_is_powered(hdev)) return 0; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE)) return 0; cod[0] = hdev->minor_class; cod[1] = hdev->major_class; cod[2] = get_service_classes(hdev); if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) cod[1] |= 0x20; if (memcmp(cod, hdev->dev_class, 3) == 0) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_CLASS_OF_DEV, sizeof(cod), cod, HCI_CMD_TIMEOUT); } static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable) { /* If there is no connection we are OK to advertise. */ if (hci_conn_num(hdev, LE_LINK) == 0) return true; /* Check le_states if there is any connection in peripheral role. */ if (hdev->conn_hash.le_num_peripheral > 0) { /* Peripheral connection state and non connectable mode * bit 20. */ if (!connectable && !(hdev->le_states[2] & 0x10)) return false; /* Peripheral connection state and connectable mode bit 38 * and scannable bit 21. */ if (connectable && (!(hdev->le_states[4] & 0x40) || !(hdev->le_states[2] & 0x20))) return false; } /* Check le_states if there is any connection in central role. */ if (hci_conn_num(hdev, LE_LINK) != hdev->conn_hash.le_num_peripheral) { /* Central connection state and non connectable mode bit 18. */ if (!connectable && !(hdev->le_states[2] & 0x02)) return false; /* Central connection state and connectable mode bit 35 and * scannable 19. */ if (connectable && (!(hdev->le_states[4] & 0x08) || !(hdev->le_states[2] & 0x08))) return false; } return true; } static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags) { /* If privacy is not enabled don't use RPA */ if (!hci_dev_test_flag(hdev, HCI_PRIVACY)) return false; /* If basic privacy mode is enabled use RPA */ if (!hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) return true; /* If limited privacy mode is enabled don't use RPA if we're * both discoverable and bondable. */ if ((flags & MGMT_ADV_FLAG_DISCOV) && hci_dev_test_flag(hdev, HCI_BONDABLE)) return false; /* We're neither bondable nor discoverable in the limited * privacy mode, therefore use RPA. */ return true; } static int hci_set_random_addr_sync(struct hci_dev *hdev, bdaddr_t *rpa) { /* If we're advertising or initiating an LE connection we can't * go ahead and change the random address at this time. This is * because the eventual initiator address used for the * subsequently created connection will be undefined (some * controllers use the new address and others the one we had * when the operation started). * * In this kind of scenario skip the update and let the random * address be updated at the next cycle. */ if (hci_dev_test_flag(hdev, HCI_LE_ADV) || hci_lookup_le_connect(hdev)) { bt_dev_dbg(hdev, "Deferring random address update"); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); return 0; } return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_RANDOM_ADDR, 6, rpa, HCI_CMD_TIMEOUT); } int hci_update_random_address_sync(struct hci_dev *hdev, bool require_privacy, bool rpa, u8 *own_addr_type) { int err; /* If privacy is enabled use a resolvable private address. If * current RPA has expired or there is something else than * the current RPA in use, then generate a new one. */ if (rpa) { /* If Controller supports LL Privacy use own address type is * 0x03 */ if (use_ll_privacy(hdev)) *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED; else *own_addr_type = ADDR_LE_DEV_RANDOM; /* Check if RPA is valid */ if (rpa_valid(hdev)) return 0; err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); if (err < 0) { bt_dev_err(hdev, "failed to generate new RPA"); return err; } err = hci_set_random_addr_sync(hdev, &hdev->rpa); if (err) return err; return 0; } /* In case of required privacy without resolvable private address, * use an non-resolvable private address. This is useful for active * scanning and non-connectable advertising. */ if (require_privacy) { bdaddr_t nrpa; while (true) { /* The non-resolvable private address is generated * from random six bytes with the two most significant * bits cleared. */ get_random_bytes(&nrpa, 6); nrpa.b[5] &= 0x3f; /* The non-resolvable private address shall not be * equal to the public address. */ if (bacmp(&hdev->bdaddr, &nrpa)) break; } *own_addr_type = ADDR_LE_DEV_RANDOM; return hci_set_random_addr_sync(hdev, &nrpa); } /* If forcing static address is in use or there is no public * address use the static address as random address (but skip * the HCI command if the current random address is already the * static one. * * In case BR/EDR has been disabled on a dual-mode controller * and a static address has been configured, then use that * address instead of the public BR/EDR address. */ if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || !bacmp(&hdev->bdaddr, BDADDR_ANY) || (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) && bacmp(&hdev->static_addr, BDADDR_ANY))) { *own_addr_type = ADDR_LE_DEV_RANDOM; if (bacmp(&hdev->static_addr, &hdev->random_addr)) return hci_set_random_addr_sync(hdev, &hdev->static_addr); return 0; } /* Neither privacy nor static address is being used so use a * public address. */ *own_addr_type = ADDR_LE_DEV_PUBLIC; return 0; } static int hci_disable_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_ext_adv_enable *cp; struct hci_cp_ext_adv_set *set; u8 data[sizeof(*cp) + sizeof(*set) * 1]; u8 size; struct adv_info *adv = NULL; /* If request specifies an instance that doesn't exist, fail */ if (instance > 0) { adv = hci_find_adv_instance(hdev, instance); if (!adv) return -EINVAL; /* If not enabled there is nothing to do */ if (!adv->enabled) return 0; } memset(data, 0, sizeof(data)); cp = (void *)data; set = (void *)cp->data; /* Instance 0x00 indicates all advertising instances will be disabled */ cp->num_of_sets = !!instance; cp->enable = 0x00; set->handle = adv ? adv->handle : instance; size = sizeof(*cp) + sizeof(*set) * cp->num_of_sets; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE, size, data, HCI_CMD_TIMEOUT); } static int hci_set_adv_set_random_addr_sync(struct hci_dev *hdev, u8 instance, bdaddr_t *random_addr) { struct hci_cp_le_set_adv_set_rand_addr cp; int err; if (!instance) { /* Instance 0x00 doesn't have an adv_info, instead it uses * hdev->random_addr to track its address so whenever it needs * to be updated this also set the random address since * hdev->random_addr is shared with scan state machine. */ err = hci_set_random_addr_sync(hdev, random_addr); if (err) return err; } memset(&cp, 0, sizeof(cp)); cp.handle = instance; bacpy(&cp.bdaddr, random_addr); return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_SET_RAND_ADDR, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_ext_adv_params cp; bool connectable; u32 flags; bdaddr_t random_addr; u8 own_addr_type; int err; struct adv_info *adv; bool secondary_adv; if (instance > 0) { adv = hci_find_adv_instance(hdev, instance); if (!adv) return -EINVAL; } else { adv = NULL; } /* Updating parameters of an active instance will return a * Command Disallowed error, so we must first disable the * instance if it is active. */ if (adv && !adv->pending) { err = hci_disable_ext_adv_instance_sync(hdev, instance); if (err) return err; } flags = hci_adv_instance_flags(hdev, instance); /* If the "connectable" instance flag was not set, then choose between * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. */ connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) || mgmt_get_connectable(hdev); if (!is_advertising_allowed(hdev, connectable)) return -EPERM; /* Set require_privacy to true only when non-connectable * advertising is used. In that case it is fine to use a * non-resolvable private address. */ err = hci_get_random_address(hdev, !connectable, adv_use_rpa(hdev, flags), adv, &own_addr_type, &random_addr); if (err < 0) return err; memset(&cp, 0, sizeof(cp)); if (adv) { hci_cpu_to_le24(adv->min_interval, cp.min_interval); hci_cpu_to_le24(adv->max_interval, cp.max_interval); cp.tx_power = adv->tx_power; } else { hci_cpu_to_le24(hdev->le_adv_min_interval, cp.min_interval); hci_cpu_to_le24(hdev->le_adv_max_interval, cp.max_interval); cp.tx_power = HCI_ADV_TX_POWER_NO_PREFERENCE; } secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK); if (connectable) { if (secondary_adv) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND); else cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND); } else if (hci_adv_instance_is_scannable(hdev, instance) || (flags & MGMT_ADV_PARAM_SCAN_RSP)) { if (secondary_adv) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND); else cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND); } else { if (secondary_adv) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_NON_CONN_IND); else cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND); } /* If Own_Address_Type equals 0x02 or 0x03, the Peer_Address parameter * contains the peer’s Identity Address and the Peer_Address_Type * parameter contains the peer’s Identity Type (i.e., 0x00 or 0x01). * These parameters are used to locate the corresponding local IRK in * the resolving list; this IRK is used to generate their own address * used in the advertisement. */ if (own_addr_type == ADDR_LE_DEV_RANDOM_RESOLVED) hci_copy_identity_address(hdev, &cp.peer_addr, &cp.peer_addr_type); cp.own_addr_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; cp.handle = adv ? adv->handle : instance; if (flags & MGMT_ADV_FLAG_SEC_2M) { cp.primary_phy = HCI_ADV_PHY_1M; cp.secondary_phy = HCI_ADV_PHY_2M; } else if (flags & MGMT_ADV_FLAG_SEC_CODED) { cp.primary_phy = HCI_ADV_PHY_CODED; cp.secondary_phy = HCI_ADV_PHY_CODED; } else { /* In all other cases use 1M */ cp.primary_phy = HCI_ADV_PHY_1M; cp.secondary_phy = HCI_ADV_PHY_1M; } err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (err) return err; if ((own_addr_type == ADDR_LE_DEV_RANDOM || own_addr_type == ADDR_LE_DEV_RANDOM_RESOLVED) && bacmp(&random_addr, BDADDR_ANY)) { /* Check if random address need to be updated */ if (adv) { if (!bacmp(&random_addr, &adv->random_addr)) return 0; } else { if (!bacmp(&random_addr, &hdev->random_addr)) return 0; } return hci_set_adv_set_random_addr_sync(hdev, instance, &random_addr); } return 0; } static int hci_set_ext_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance) { DEFINE_FLEX(struct hci_cp_le_set_ext_scan_rsp_data, pdu, data, length, HCI_MAX_EXT_AD_LENGTH); u8 len; struct adv_info *adv = NULL; int err; if (instance) { adv = hci_find_adv_instance(hdev, instance); if (!adv || !adv->scan_rsp_changed) return 0; } len = eir_create_scan_rsp(hdev, instance, pdu->data); pdu->handle = adv ? adv->handle : instance; pdu->length = len; pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE; pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG; err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA, struct_size(pdu, data, len), pdu, HCI_CMD_TIMEOUT); if (err) return err; if (adv) { adv->scan_rsp_changed = false; } else { memcpy(hdev->scan_rsp_data, pdu->data, len); hdev->scan_rsp_data_len = len; } return 0; } static int __hci_set_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_scan_rsp_data cp; u8 len; memset(&cp, 0, sizeof(cp)); len = eir_create_scan_rsp(hdev, instance, cp.data); if (hdev->scan_rsp_data_len == len && !memcmp(cp.data, hdev->scan_rsp_data, len)) return 0; memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); hdev->scan_rsp_data_len = len; cp.length = len; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_update_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance) { if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return 0; if (ext_adv_capable(hdev)) return hci_set_ext_scan_rsp_data_sync(hdev, instance); return __hci_set_scan_rsp_data_sync(hdev, instance); } int hci_enable_ext_advertising_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_ext_adv_enable *cp; struct hci_cp_ext_adv_set *set; u8 data[sizeof(*cp) + sizeof(*set) * 1]; struct adv_info *adv; if (instance > 0) { adv = hci_find_adv_instance(hdev, instance); if (!adv) return -EINVAL; /* If already enabled there is nothing to do */ if (adv->enabled) return 0; } else { adv = NULL; } cp = (void *)data; set = (void *)cp->data; memset(cp, 0, sizeof(*cp)); cp->enable = 0x01; cp->num_of_sets = 0x01; memset(set, 0, sizeof(*set)); set->handle = adv ? adv->handle : instance; /* Set duration per instance since controller is responsible for * scheduling it. */ if (adv && adv->timeout) { u16 duration = adv->timeout * MSEC_PER_SEC; /* Time = N * 10 ms */ set->duration = cpu_to_le16(duration / 10); } return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(*cp) + sizeof(*set) * cp->num_of_sets, data, HCI_CMD_TIMEOUT); } int hci_start_ext_adv_sync(struct hci_dev *hdev, u8 instance) { int err; err = hci_setup_ext_adv_instance_sync(hdev, instance); if (err) return err; err = hci_set_ext_scan_rsp_data_sync(hdev, instance); if (err) return err; return hci_enable_ext_advertising_sync(hdev, instance); } int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_per_adv_enable cp; struct adv_info *adv = NULL; /* If periodic advertising already disabled there is nothing to do. */ adv = hci_find_adv_instance(hdev, instance); if (!adv || !adv->periodic || !adv->enabled) return 0; memset(&cp, 0, sizeof(cp)); cp.enable = 0x00; cp.handle = instance; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_set_per_adv_params_sync(struct hci_dev *hdev, u8 instance, u16 min_interval, u16 max_interval) { struct hci_cp_le_set_per_adv_params cp; memset(&cp, 0, sizeof(cp)); if (!min_interval) min_interval = DISCOV_LE_PER_ADV_INT_MIN; if (!max_interval) max_interval = DISCOV_LE_PER_ADV_INT_MAX; cp.handle = instance; cp.min_interval = cpu_to_le16(min_interval); cp.max_interval = cpu_to_le16(max_interval); cp.periodic_properties = 0x0000; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_PARAMS, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_set_per_adv_data_sync(struct hci_dev *hdev, u8 instance) { DEFINE_FLEX(struct hci_cp_le_set_per_adv_data, pdu, data, length, HCI_MAX_PER_AD_LENGTH); u8 len; struct adv_info *adv = NULL; if (instance) { adv = hci_find_adv_instance(hdev, instance); if (!adv || !adv->periodic) return 0; } len = eir_create_per_adv_data(hdev, instance, pdu->data); pdu->length = len; pdu->handle = adv ? adv->handle : instance; pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_DATA, struct_size(pdu, data, len), pdu, HCI_CMD_TIMEOUT); } static int hci_enable_per_advertising_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_per_adv_enable cp; struct adv_info *adv = NULL; /* If periodic advertising already enabled there is nothing to do. */ adv = hci_find_adv_instance(hdev, instance); if (adv && adv->periodic && adv->enabled) return 0; memset(&cp, 0, sizeof(cp)); cp.enable = 0x01; cp.handle = instance; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* Checks if periodic advertising data contains a Basic Announcement and if it * does generates a Broadcast ID and add Broadcast Announcement. */ static int hci_adv_bcast_annoucement(struct hci_dev *hdev, struct adv_info *adv) { u8 bid[3]; u8 ad[4 + 3]; /* Skip if NULL adv as instance 0x00 is used for general purpose * advertising so it cannot used for the likes of Broadcast Announcement * as it can be overwritten at any point. */ if (!adv) return 0; /* Check if PA data doesn't contains a Basic Audio Announcement then * there is nothing to do. */ if (!eir_get_service_data(adv->per_adv_data, adv->per_adv_data_len, 0x1851, NULL)) return 0; /* Check if advertising data already has a Broadcast Announcement since * the process may want to control the Broadcast ID directly and in that * case the kernel shall no interfere. */ if (eir_get_service_data(adv->adv_data, adv->adv_data_len, 0x1852, NULL)) return 0; /* Generate Broadcast ID */ get_random_bytes(bid, sizeof(bid)); eir_append_service_data(ad, 0, 0x1852, bid, sizeof(bid)); hci_set_adv_instance_data(hdev, adv->instance, sizeof(ad), ad, 0, NULL); return hci_update_adv_data_sync(hdev, adv->instance); } int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len, u8 *data, u32 flags, u16 min_interval, u16 max_interval, u16 sync_interval) { struct adv_info *adv = NULL; int err; bool added = false; hci_disable_per_advertising_sync(hdev, instance); if (instance) { adv = hci_find_adv_instance(hdev, instance); /* Create an instance if that could not be found */ if (!adv) { adv = hci_add_per_instance(hdev, instance, flags, data_len, data, sync_interval, sync_interval); if (IS_ERR(adv)) return PTR_ERR(adv); adv->pending = false; added = true; } } /* Start advertising */ err = hci_start_ext_adv_sync(hdev, instance); if (err < 0) goto fail; err = hci_adv_bcast_annoucement(hdev, adv); if (err < 0) goto fail; err = hci_set_per_adv_params_sync(hdev, instance, min_interval, max_interval); if (err < 0) goto fail; err = hci_set_per_adv_data_sync(hdev, instance); if (err < 0) goto fail; err = hci_enable_per_advertising_sync(hdev, instance); if (err < 0) goto fail; return 0; fail: if (added) hci_remove_adv_instance(hdev, instance); return err; } static int hci_start_adv_sync(struct hci_dev *hdev, u8 instance) { int err; if (ext_adv_capable(hdev)) return hci_start_ext_adv_sync(hdev, instance); err = hci_update_adv_data_sync(hdev, instance); if (err) return err; err = hci_update_scan_rsp_data_sync(hdev, instance); if (err) return err; return hci_enable_advertising_sync(hdev); } int hci_enable_advertising_sync(struct hci_dev *hdev) { struct adv_info *adv_instance; struct hci_cp_le_set_adv_param cp; u8 own_addr_type, enable = 0x01; bool connectable; u16 adv_min_interval, adv_max_interval; u32 flags; u8 status; if (ext_adv_capable(hdev)) return hci_enable_ext_advertising_sync(hdev, hdev->cur_adv_instance); flags = hci_adv_instance_flags(hdev, hdev->cur_adv_instance); adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance); /* If the "connectable" instance flag was not set, then choose between * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. */ connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) || mgmt_get_connectable(hdev); if (!is_advertising_allowed(hdev, connectable)) return -EINVAL; status = hci_disable_advertising_sync(hdev); if (status) return status; /* Clear the HCI_LE_ADV bit temporarily so that the * hci_update_random_address knows that it's safe to go ahead * and write a new random address. The flag will be set back on * as soon as the SET_ADV_ENABLE HCI command completes. */ hci_dev_clear_flag(hdev, HCI_LE_ADV); /* Set require_privacy to true only when non-connectable * advertising is used. In that case it is fine to use a * non-resolvable private address. */ status = hci_update_random_address_sync(hdev, !connectable, adv_use_rpa(hdev, flags), &own_addr_type); if (status) return status; memset(&cp, 0, sizeof(cp)); if (adv_instance) { adv_min_interval = adv_instance->min_interval; adv_max_interval = adv_instance->max_interval; } else { adv_min_interval = hdev->le_adv_min_interval; adv_max_interval = hdev->le_adv_max_interval; } if (connectable) { cp.type = LE_ADV_IND; } else { if (hci_adv_instance_is_scannable(hdev, hdev->cur_adv_instance)) cp.type = LE_ADV_SCAN_IND; else cp.type = LE_ADV_NONCONN_IND; if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE) || hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) { adv_min_interval = DISCOV_LE_FAST_ADV_INT_MIN; adv_max_interval = DISCOV_LE_FAST_ADV_INT_MAX; } } cp.min_interval = cpu_to_le16(adv_min_interval); cp.max_interval = cpu_to_le16(adv_max_interval); cp.own_address_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; status = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (status) return status; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable, HCI_CMD_TIMEOUT); } static int enable_advertising_sync(struct hci_dev *hdev, void *data) { return hci_enable_advertising_sync(hdev); } int hci_enable_advertising(struct hci_dev *hdev) { if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) && list_empty(&hdev->adv_instances)) return 0; return hci_cmd_sync_queue(hdev, enable_advertising_sync, NULL, NULL); } int hci_remove_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance, struct sock *sk) { int err; if (!ext_adv_capable(hdev)) return 0; err = hci_disable_ext_adv_instance_sync(hdev, instance); if (err) return err; /* If request specifies an instance that doesn't exist, fail */ if (instance > 0 && !hci_find_adv_instance(hdev, instance)) return -EINVAL; return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_REMOVE_ADV_SET, sizeof(instance), &instance, 0, HCI_CMD_TIMEOUT, sk); } static int remove_ext_adv_sync(struct hci_dev *hdev, void *data) { struct adv_info *adv = data; u8 instance = 0; if (adv) instance = adv->instance; return hci_remove_ext_adv_instance_sync(hdev, instance, NULL); } int hci_remove_ext_adv_instance(struct hci_dev *hdev, u8 instance) { struct adv_info *adv = NULL; if (instance) { adv = hci_find_adv_instance(hdev, instance); if (!adv) return -EINVAL; } return hci_cmd_sync_queue(hdev, remove_ext_adv_sync, adv, NULL); } int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason) { struct hci_cp_le_term_big cp; memset(&cp, 0, sizeof(cp)); cp.handle = handle; cp.reason = reason; return __hci_cmd_sync_status(hdev, HCI_OP_LE_TERM_BIG, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_set_ext_adv_data_sync(struct hci_dev *hdev, u8 instance) { DEFINE_FLEX(struct hci_cp_le_set_ext_adv_data, pdu, data, length, HCI_MAX_EXT_AD_LENGTH); u8 len; struct adv_info *adv = NULL; int err; if (instance) { adv = hci_find_adv_instance(hdev, instance); if (!adv || !adv->adv_data_changed) return 0; } len = eir_create_adv_data(hdev, instance, pdu->data); pdu->length = len; pdu->handle = adv ? adv->handle : instance; pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE; pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG; err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_DATA, struct_size(pdu, data, len), pdu, HCI_CMD_TIMEOUT); if (err) return err; /* Update data if the command succeed */ if (adv) { adv->adv_data_changed = false; } else { memcpy(hdev->adv_data, pdu->data, len); hdev->adv_data_len = len; } return 0; } static int hci_set_adv_data_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_adv_data cp; u8 len; memset(&cp, 0, sizeof(cp)); len = eir_create_adv_data(hdev, instance, cp.data); /* There's nothing to do if the data hasn't changed */ if (hdev->adv_data_len == len && memcmp(cp.data, hdev->adv_data, len) == 0) return 0; memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); hdev->adv_data_len = len; cp.length = len; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_update_adv_data_sync(struct hci_dev *hdev, u8 instance) { if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return 0; if (ext_adv_capable(hdev)) return hci_set_ext_adv_data_sync(hdev, instance); return hci_set_adv_data_sync(hdev, instance); } int hci_schedule_adv_instance_sync(struct hci_dev *hdev, u8 instance, bool force) { struct adv_info *adv = NULL; u16 timeout; if (hci_dev_test_flag(hdev, HCI_ADVERTISING) && !ext_adv_capable(hdev)) return -EPERM; if (hdev->adv_instance_timeout) return -EBUSY; adv = hci_find_adv_instance(hdev, instance); if (!adv) return -ENOENT; /* A zero timeout means unlimited advertising. As long as there is * only one instance, duration should be ignored. We still set a timeout * in case further instances are being added later on. * * If the remaining lifetime of the instance is more than the duration * then the timeout corresponds to the duration, otherwise it will be * reduced to the remaining instance lifetime. */ if (adv->timeout == 0 || adv->duration <= adv->remaining_time) timeout = adv->duration; else timeout = adv->remaining_time; /* The remaining time is being reduced unless the instance is being * advertised without time limit. */ if (adv->timeout) adv->remaining_time = adv->remaining_time - timeout; /* Only use work for scheduling instances with legacy advertising */ if (!ext_adv_capable(hdev)) { hdev->adv_instance_timeout = timeout; queue_delayed_work(hdev->req_workqueue, &hdev->adv_instance_expire, msecs_to_jiffies(timeout * 1000)); } /* If we're just re-scheduling the same instance again then do not * execute any HCI commands. This happens when a single instance is * being advertised. */ if (!force && hdev->cur_adv_instance == instance && hci_dev_test_flag(hdev, HCI_LE_ADV)) return 0; hdev->cur_adv_instance = instance; return hci_start_adv_sync(hdev, instance); } static int hci_clear_adv_sets_sync(struct hci_dev *hdev, struct sock *sk) { int err; if (!ext_adv_capable(hdev)) return 0; /* Disable instance 0x00 to disable all instances */ err = hci_disable_ext_adv_instance_sync(hdev, 0x00); if (err) return err; return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CLEAR_ADV_SETS, 0, NULL, 0, HCI_CMD_TIMEOUT, sk); } static int hci_clear_adv_sync(struct hci_dev *hdev, struct sock *sk, bool force) { struct adv_info *adv, *n; int err = 0; if (ext_adv_capable(hdev)) /* Remove all existing sets */ err = hci_clear_adv_sets_sync(hdev, sk); if (ext_adv_capable(hdev)) return err; /* This is safe as long as there is no command send while the lock is * held. */ hci_dev_lock(hdev); /* Cleanup non-ext instances */ list_for_each_entry_safe(adv, n, &hdev->adv_instances, list) { u8 instance = adv->instance; int err; if (!(force || adv->timeout)) continue; err = hci_remove_adv_instance(hdev, instance); if (!err) mgmt_advertising_removed(sk, hdev, instance); } hci_dev_unlock(hdev); return 0; } static int hci_remove_adv_sync(struct hci_dev *hdev, u8 instance, struct sock *sk) { int err = 0; /* If we use extended advertising, instance has to be removed first. */ if (ext_adv_capable(hdev)) err = hci_remove_ext_adv_instance_sync(hdev, instance, sk); if (ext_adv_capable(hdev)) return err; /* This is safe as long as there is no command send while the lock is * held. */ hci_dev_lock(hdev); err = hci_remove_adv_instance(hdev, instance); if (!err) mgmt_advertising_removed(sk, hdev, instance); hci_dev_unlock(hdev); return err; } /* For a single instance: * - force == true: The instance will be removed even when its remaining * lifetime is not zero. * - force == false: the instance will be deactivated but kept stored unless * the remaining lifetime is zero. * * For instance == 0x00: * - force == true: All instances will be removed regardless of their timeout * setting. * - force == false: Only instances that have a timeout will be removed. */ int hci_remove_advertising_sync(struct hci_dev *hdev, struct sock *sk, u8 instance, bool force) { struct adv_info *next = NULL; int err; /* Cancel any timeout concerning the removed instance(s). */ if (!instance || hdev->cur_adv_instance == instance) cancel_adv_timeout(hdev); /* Get the next instance to advertise BEFORE we remove * the current one. This can be the same instance again * if there is only one instance. */ if (hdev->cur_adv_instance == instance) next = hci_get_next_instance(hdev, instance); if (!instance) { err = hci_clear_adv_sync(hdev, sk, force); if (err) return err; } else { struct adv_info *adv = hci_find_adv_instance(hdev, instance); if (force || (adv && adv->timeout && !adv->remaining_time)) { /* Don't advertise a removed instance. */ if (next && next->instance == instance) next = NULL; err = hci_remove_adv_sync(hdev, instance, sk); if (err) return err; } } if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_ADVERTISING)) return 0; if (next && !ext_adv_capable(hdev)) hci_schedule_adv_instance_sync(hdev, next->instance, false); return 0; } int hci_read_rssi_sync(struct hci_dev *hdev, __le16 handle) { struct hci_cp_read_rssi cp; cp.handle = handle; return __hci_cmd_sync_status(hdev, HCI_OP_READ_RSSI, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_read_clock_sync(struct hci_dev *hdev, struct hci_cp_read_clock *cp) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_CLOCK, sizeof(*cp), cp, HCI_CMD_TIMEOUT); } int hci_read_tx_power_sync(struct hci_dev *hdev, __le16 handle, u8 type) { struct hci_cp_read_tx_power cp; cp.handle = handle; cp.type = type; return __hci_cmd_sync_status(hdev, HCI_OP_READ_TX_POWER, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_disable_advertising_sync(struct hci_dev *hdev) { u8 enable = 0x00; int err = 0; /* If controller is not advertising we are done. */ if (!hci_dev_test_flag(hdev, HCI_LE_ADV)) return 0; if (ext_adv_capable(hdev)) err = hci_disable_ext_adv_instance_sync(hdev, 0x00); if (ext_adv_capable(hdev)) return err; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable, HCI_CMD_TIMEOUT); } static int hci_le_set_ext_scan_enable_sync(struct hci_dev *hdev, u8 val, u8 filter_dup) { struct hci_cp_le_set_ext_scan_enable cp; memset(&cp, 0, sizeof(cp)); cp.enable = val; if (hci_dev_test_flag(hdev, HCI_MESH)) cp.filter_dup = LE_SCAN_FILTER_DUP_DISABLE; else cp.filter_dup = filter_dup; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_ENABLE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_le_set_scan_enable_sync(struct hci_dev *hdev, u8 val, u8 filter_dup) { struct hci_cp_le_set_scan_enable cp; if (use_ext_scan(hdev)) return hci_le_set_ext_scan_enable_sync(hdev, val, filter_dup); memset(&cp, 0, sizeof(cp)); cp.enable = val; if (val && hci_dev_test_flag(hdev, HCI_MESH)) cp.filter_dup = LE_SCAN_FILTER_DUP_DISABLE; else cp.filter_dup = filter_dup; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_le_set_addr_resolution_enable_sync(struct hci_dev *hdev, u8 val) { if (!use_ll_privacy(hdev)) return 0; /* If controller is not/already resolving we are done. */ if (val == hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE, sizeof(val), &val, HCI_CMD_TIMEOUT); } static int hci_scan_disable_sync(struct hci_dev *hdev) { int err; /* If controller is not scanning we are done. */ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) return 0; if (hdev->scanning_paused) { bt_dev_dbg(hdev, "Scanning is paused for suspend"); return 0; } err = hci_le_set_scan_enable_sync(hdev, LE_SCAN_DISABLE, 0x00); if (err) { bt_dev_err(hdev, "Unable to disable scanning: %d", err); return err; } return err; } static bool scan_use_rpa(struct hci_dev *hdev) { return hci_dev_test_flag(hdev, HCI_PRIVACY); } static void hci_start_interleave_scan(struct hci_dev *hdev) { hdev->interleave_scan_state = INTERLEAVE_SCAN_NO_FILTER; queue_delayed_work(hdev->req_workqueue, &hdev->interleave_scan, 0); } static void cancel_interleave_scan(struct hci_dev *hdev) { bt_dev_dbg(hdev, "cancelling interleave scan"); cancel_delayed_work_sync(&hdev->interleave_scan); hdev->interleave_scan_state = INTERLEAVE_SCAN_NONE; } /* Return true if interleave_scan wasn't started until exiting this function, * otherwise, return false */ static bool hci_update_interleaved_scan_sync(struct hci_dev *hdev) { /* Do interleaved scan only if all of the following are true: * - There is at least one ADV monitor * - At least one pending LE connection or one device to be scanned for * - Monitor offloading is not supported * If so, we should alternate between allowlist scan and one without * any filters to save power. */ bool use_interleaving = hci_is_adv_monitoring(hdev) && !(list_empty(&hdev->pend_le_conns) && list_empty(&hdev->pend_le_reports)) && hci_get_adv_monitor_offload_ext(hdev) == HCI_ADV_MONITOR_EXT_NONE; bool is_interleaving = is_interleave_scanning(hdev); if (use_interleaving && !is_interleaving) { hci_start_interleave_scan(hdev); bt_dev_dbg(hdev, "starting interleave scan"); return true; } if (!use_interleaving && is_interleaving) cancel_interleave_scan(hdev); return false; } /* Removes connection to resolve list if needed.*/ static int hci_le_del_resolve_list_sync(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct hci_cp_le_del_from_resolv_list cp; struct bdaddr_list_with_irk *entry; if (!use_ll_privacy(hdev)) return 0; /* Check if the IRK has been programmed */ entry = hci_bdaddr_list_lookup_with_irk(&hdev->le_resolv_list, bdaddr, bdaddr_type); if (!entry) return 0; cp.bdaddr_type = bdaddr_type; bacpy(&cp.bdaddr, bdaddr); return __hci_cmd_sync_status(hdev, HCI_OP_LE_DEL_FROM_RESOLV_LIST, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_le_del_accept_list_sync(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct hci_cp_le_del_from_accept_list cp; int err; /* Check if device is on accept list before removing it */ if (!hci_bdaddr_list_lookup(&hdev->le_accept_list, bdaddr, bdaddr_type)) return 0; cp.bdaddr_type = bdaddr_type; bacpy(&cp.bdaddr, bdaddr); /* Ignore errors when removing from resolving list as that is likely * that the device was never added. */ hci_le_del_resolve_list_sync(hdev, &cp.bdaddr, cp.bdaddr_type); err = __hci_cmd_sync_status(hdev, HCI_OP_LE_DEL_FROM_ACCEPT_LIST, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (err) { bt_dev_err(hdev, "Unable to remove from allow list: %d", err); return err; } bt_dev_dbg(hdev, "Remove %pMR (0x%x) from allow list", &cp.bdaddr, cp.bdaddr_type); return 0; } struct conn_params { bdaddr_t addr; u8 addr_type; hci_conn_flags_t flags; u8 privacy_mode; }; /* Adds connection to resolve list if needed. * Setting params to NULL programs local hdev->irk */ static int hci_le_add_resolve_list_sync(struct hci_dev *hdev, struct conn_params *params) { struct hci_cp_le_add_to_resolv_list cp; struct smp_irk *irk; struct bdaddr_list_with_irk *entry; struct hci_conn_params *p; if (!use_ll_privacy(hdev)) return 0; /* Attempt to program local identity address, type and irk if params is * NULL. */ if (!params) { if (!hci_dev_test_flag(hdev, HCI_PRIVACY)) return 0; hci_copy_identity_address(hdev, &cp.bdaddr, &cp.bdaddr_type); memcpy(cp.peer_irk, hdev->irk, 16); goto done; } irk = hci_find_irk_by_addr(hdev, &params->addr, params->addr_type); if (!irk) return 0; /* Check if the IK has _not_ been programmed yet. */ entry = hci_bdaddr_list_lookup_with_irk(&hdev->le_resolv_list, &params->addr, params->addr_type); if (entry) return 0; cp.bdaddr_type = params->addr_type; bacpy(&cp.bdaddr, &params->addr); memcpy(cp.peer_irk, irk->val, 16); /* Default privacy mode is always Network */ params->privacy_mode = HCI_NETWORK_PRIVACY; rcu_read_lock(); p = hci_pend_le_action_lookup(&hdev->pend_le_conns, &params->addr, params->addr_type); if (!p) p = hci_pend_le_action_lookup(&hdev->pend_le_reports, &params->addr, params->addr_type); if (p) WRITE_ONCE(p->privacy_mode, HCI_NETWORK_PRIVACY); rcu_read_unlock(); done: if (hci_dev_test_flag(hdev, HCI_PRIVACY)) memcpy(cp.local_irk, hdev->irk, 16); else memset(cp.local_irk, 0, 16); return __hci_cmd_sync_status(hdev, HCI_OP_LE_ADD_TO_RESOLV_LIST, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* Set Device Privacy Mode. */ static int hci_le_set_privacy_mode_sync(struct hci_dev *hdev, struct conn_params *params) { struct hci_cp_le_set_privacy_mode cp; struct smp_irk *irk; /* If device privacy mode has already been set there is nothing to do */ if (params->privacy_mode == HCI_DEVICE_PRIVACY) return 0; /* Check if HCI_CONN_FLAG_DEVICE_PRIVACY has been set as it also * indicates that LL Privacy has been enabled and * HCI_OP_LE_SET_PRIVACY_MODE is supported. */ if (!(params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY)) return 0; irk = hci_find_irk_by_addr(hdev, &params->addr, params->addr_type); if (!irk) return 0; memset(&cp, 0, sizeof(cp)); cp.bdaddr_type = irk->addr_type; bacpy(&cp.bdaddr, &irk->bdaddr); cp.mode = HCI_DEVICE_PRIVACY; /* Note: params->privacy_mode is not updated since it is a copy */ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PRIVACY_MODE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* Adds connection to allow list if needed, if the device uses RPA (has IRK) * this attempts to program the device in the resolving list as well and * properly set the privacy mode. */ static int hci_le_add_accept_list_sync(struct hci_dev *hdev, struct conn_params *params, u8 *num_entries) { struct hci_cp_le_add_to_accept_list cp; int err; /* During suspend, only wakeable devices can be in acceptlist */ if (hdev->suspended && !(params->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) { hci_le_del_accept_list_sync(hdev, &params->addr, params->addr_type); return 0; } /* Select filter policy to accept all advertising */ if (*num_entries >= hdev->le_accept_list_size) return -ENOSPC; /* Accept list can not be used with RPAs */ if (!use_ll_privacy(hdev) && hci_find_irk_by_addr(hdev, &params->addr, params->addr_type)) return -EINVAL; /* Attempt to program the device in the resolving list first to avoid * having to rollback in case it fails since the resolving list is * dynamic it can probably be smaller than the accept list. */ err = hci_le_add_resolve_list_sync(hdev, params); if (err) { bt_dev_err(hdev, "Unable to add to resolve list: %d", err); return err; } /* Set Privacy Mode */ err = hci_le_set_privacy_mode_sync(hdev, params); if (err) { bt_dev_err(hdev, "Unable to set privacy mode: %d", err); return err; } /* Check if already in accept list */ if (hci_bdaddr_list_lookup(&hdev->le_accept_list, &params->addr, params->addr_type)) return 0; *num_entries += 1; cp.bdaddr_type = params->addr_type; bacpy(&cp.bdaddr, &params->addr); err = __hci_cmd_sync_status(hdev, HCI_OP_LE_ADD_TO_ACCEPT_LIST, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (err) { bt_dev_err(hdev, "Unable to add to allow list: %d", err); /* Rollback the device from the resolving list */ hci_le_del_resolve_list_sync(hdev, &cp.bdaddr, cp.bdaddr_type); return err; } bt_dev_dbg(hdev, "Add %pMR (0x%x) to allow list", &cp.bdaddr, cp.bdaddr_type); return 0; } /* This function disables/pause all advertising instances */ static int hci_pause_advertising_sync(struct hci_dev *hdev) { int err; int old_state; /* If already been paused there is nothing to do. */ if (hdev->advertising_paused) return 0; bt_dev_dbg(hdev, "Pausing directed advertising"); /* Stop directed advertising */ old_state = hci_dev_test_flag(hdev, HCI_ADVERTISING); if (old_state) { /* When discoverable timeout triggers, then just make sure * the limited discoverable flag is cleared. Even in the case * of a timeout triggered from general discoverable, it is * safe to unconditionally clear the flag. */ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); hdev->discov_timeout = 0; } bt_dev_dbg(hdev, "Pausing advertising instances"); /* Call to disable any advertisements active on the controller. * This will succeed even if no advertisements are configured. */ err = hci_disable_advertising_sync(hdev); if (err) return err; /* If we are using software rotation, pause the loop */ if (!ext_adv_capable(hdev)) cancel_adv_timeout(hdev); hdev->advertising_paused = true; hdev->advertising_old_state = old_state; return 0; } /* This function enables all user advertising instances */ static int hci_resume_advertising_sync(struct hci_dev *hdev) { struct adv_info *adv, *tmp; int err; /* If advertising has not been paused there is nothing to do. */ if (!hdev->advertising_paused) return 0; /* Resume directed advertising */ hdev->advertising_paused = false; if (hdev->advertising_old_state) { hci_dev_set_flag(hdev, HCI_ADVERTISING); hdev->advertising_old_state = 0; } bt_dev_dbg(hdev, "Resuming advertising instances"); if (ext_adv_capable(hdev)) { /* Call for each tracked instance to be re-enabled */ list_for_each_entry_safe(adv, tmp, &hdev->adv_instances, list) { err = hci_enable_ext_advertising_sync(hdev, adv->instance); if (!err) continue; /* If the instance cannot be resumed remove it */ hci_remove_ext_adv_instance_sync(hdev, adv->instance, NULL); } } else { /* Schedule for most recent instance to be restarted and begin * the software rotation loop */ err = hci_schedule_adv_instance_sync(hdev, hdev->cur_adv_instance, true); } hdev->advertising_paused = false; return err; } static int hci_pause_addr_resolution(struct hci_dev *hdev) { int err; if (!use_ll_privacy(hdev)) return 0; if (!hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) return 0; /* Cannot disable addr resolution if scanning is enabled or * when initiating an LE connection. */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN) || hci_lookup_le_connect(hdev)) { bt_dev_err(hdev, "Command not allowed when scan/LE connect"); return -EPERM; } /* Cannot disable addr resolution if advertising is enabled. */ err = hci_pause_advertising_sync(hdev); if (err) { bt_dev_err(hdev, "Pause advertising failed: %d", err); return err; } err = hci_le_set_addr_resolution_enable_sync(hdev, 0x00); if (err) bt_dev_err(hdev, "Unable to disable Address Resolution: %d", err); /* Return if address resolution is disabled and RPA is not used. */ if (!err && scan_use_rpa(hdev)) return 0; hci_resume_advertising_sync(hdev); return err; } struct sk_buff *hci_read_local_oob_data_sync(struct hci_dev *hdev, bool extended, struct sock *sk) { u16 opcode = extended ? HCI_OP_READ_LOCAL_OOB_EXT_DATA : HCI_OP_READ_LOCAL_OOB_DATA; return __hci_cmd_sync_sk(hdev, opcode, 0, NULL, 0, HCI_CMD_TIMEOUT, sk); } static struct conn_params *conn_params_copy(struct list_head *list, size_t *n) { struct hci_conn_params *params; struct conn_params *p; size_t i; rcu_read_lock(); i = 0; list_for_each_entry_rcu(params, list, action) ++i; *n = i; rcu_read_unlock(); p = kvcalloc(*n, sizeof(struct conn_params), GFP_KERNEL); if (!p) return NULL; rcu_read_lock(); i = 0; list_for_each_entry_rcu(params, list, action) { /* Racing adds are handled in next scan update */ if (i >= *n) break; /* No hdev->lock, but: addr, addr_type are immutable. * privacy_mode is only written by us or in * hci_cc_le_set_privacy_mode that we wait for. * We should be idempotent so MGMT updating flags * while we are processing is OK. */ bacpy(&p[i].addr, &params->addr); p[i].addr_type = params->addr_type; p[i].flags = READ_ONCE(params->flags); p[i].privacy_mode = READ_ONCE(params->privacy_mode); ++i; } rcu_read_unlock(); *n = i; return p; } /* Clear LE Accept List */ static int hci_le_clear_accept_list_sync(struct hci_dev *hdev) { if (!(hdev->commands[26] & 0x80)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_CLEAR_ACCEPT_LIST, 0, NULL, HCI_CMD_TIMEOUT); } /* Device must not be scanning when updating the accept list. * * Update is done using the following sequence: * * use_ll_privacy((Disable Advertising) -> Disable Resolving List) -> * Remove Devices From Accept List -> * (has IRK && use_ll_privacy(Remove Devices From Resolving List))-> * Add Devices to Accept List -> * (has IRK && use_ll_privacy(Remove Devices From Resolving List)) -> * use_ll_privacy(Enable Resolving List -> (Enable Advertising)) -> * Enable Scanning * * In case of failure advertising shall be restored to its original state and * return would disable accept list since either accept or resolving list could * not be programmed. * */ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) { struct conn_params *params; struct bdaddr_list *b, *t; u8 num_entries = 0; bool pend_conn, pend_report; u8 filter_policy; size_t i, n; int err; /* Pause advertising if resolving list can be used as controllers * cannot accept resolving list modifications while advertising. */ if (use_ll_privacy(hdev)) { err = hci_pause_advertising_sync(hdev); if (err) { bt_dev_err(hdev, "pause advertising failed: %d", err); return 0x00; } } /* Disable address resolution while reprogramming accept list since * devices that do have an IRK will be programmed in the resolving list * when LL Privacy is enabled. */ err = hci_le_set_addr_resolution_enable_sync(hdev, 0x00); if (err) { bt_dev_err(hdev, "Unable to disable LL privacy: %d", err); goto done; } /* Force address filtering if PA Sync is in progress */ if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) { struct hci_cp_le_pa_create_sync *sent; sent = hci_sent_cmd_data(hdev, HCI_OP_LE_PA_CREATE_SYNC); if (sent) { struct conn_params pa; memset(&pa, 0, sizeof(pa)); bacpy(&pa.addr, &sent->addr); pa.addr_type = sent->addr_type; /* Clear first since there could be addresses left * behind. */ hci_le_clear_accept_list_sync(hdev); num_entries = 1; err = hci_le_add_accept_list_sync(hdev, &pa, &num_entries); goto done; } } /* Go through the current accept list programmed into the * controller one by one and check if that address is connected or is * still in the list of pending connections or list of devices to * report. If not present in either list, then remove it from * the controller. */ list_for_each_entry_safe(b, t, &hdev->le_accept_list, list) { if (hci_conn_hash_lookup_le(hdev, &b->bdaddr, b->bdaddr_type)) continue; /* Pointers not dereferenced, no locks needed */ pend_conn = hci_pend_le_action_lookup(&hdev->pend_le_conns, &b->bdaddr, b->bdaddr_type); pend_report = hci_pend_le_action_lookup(&hdev->pend_le_reports, &b->bdaddr, b->bdaddr_type); /* If the device is not likely to connect or report, * remove it from the acceptlist. */ if (!pend_conn && !pend_report) { hci_le_del_accept_list_sync(hdev, &b->bdaddr, b->bdaddr_type); continue; } num_entries++; } /* Since all no longer valid accept list entries have been * removed, walk through the list of pending connections * and ensure that any new device gets programmed into * the controller. * * If the list of the devices is larger than the list of * available accept list entries in the controller, then * just abort and return filer policy value to not use the * accept list. * * The list and params may be mutated while we wait for events, * so make a copy and iterate it. */ params = conn_params_copy(&hdev->pend_le_conns, &n); if (!params) { err = -ENOMEM; goto done; } for (i = 0; i < n; ++i) { err = hci_le_add_accept_list_sync(hdev, &params[i], &num_entries); if (err) { kvfree(params); goto done; } } kvfree(params); /* After adding all new pending connections, walk through * the list of pending reports and also add these to the * accept list if there is still space. Abort if space runs out. */ params = conn_params_copy(&hdev->pend_le_reports, &n); if (!params) { err = -ENOMEM; goto done; } for (i = 0; i < n; ++i) { err = hci_le_add_accept_list_sync(hdev, &params[i], &num_entries); if (err) { kvfree(params); goto done; } } kvfree(params); /* Use the allowlist unless the following conditions are all true: * - We are not currently suspending * - There are 1 or more ADV monitors registered and it's not offloaded * - Interleaved scanning is not currently using the allowlist */ if (!idr_is_empty(&hdev->adv_monitors_idr) && !hdev->suspended && hci_get_adv_monitor_offload_ext(hdev) == HCI_ADV_MONITOR_EXT_NONE && hdev->interleave_scan_state != INTERLEAVE_SCAN_ALLOWLIST) err = -EINVAL; done: filter_policy = err ? 0x00 : 0x01; /* Enable address resolution when LL Privacy is enabled. */ err = hci_le_set_addr_resolution_enable_sync(hdev, 0x01); if (err) bt_dev_err(hdev, "Unable to enable LL privacy: %d", err); /* Resume advertising if it was paused */ if (use_ll_privacy(hdev)) hci_resume_advertising_sync(hdev); /* Select filter policy to use accept list */ return filter_policy; } static void hci_le_scan_phy_params(struct hci_cp_le_scan_phy_params *cp, u8 type, u16 interval, u16 window) { cp->type = type; cp->interval = cpu_to_le16(interval); cp->window = cpu_to_le16(window); } static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, u16 interval, u16 window, u8 own_addr_type, u8 filter_policy) { struct hci_cp_le_set_ext_scan_params *cp; struct hci_cp_le_scan_phy_params *phy; u8 data[sizeof(*cp) + sizeof(*phy) * 2]; u8 num_phy = 0x00; cp = (void *)data; phy = (void *)cp->data; memset(data, 0, sizeof(data)); cp->own_addr_type = own_addr_type; cp->filter_policy = filter_policy; /* Check if PA Sync is in progress then select the PHY based on the * hci_conn.iso_qos. */ if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) { struct hci_cp_le_add_to_accept_list *sent; sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_ACCEPT_LIST); if (sent) { struct hci_conn *conn; conn = hci_conn_hash_lookup_ba(hdev, ISO_LINK, &sent->bdaddr); if (conn) { struct bt_iso_qos *qos = &conn->iso_qos; if (qos->bcast.in.phy & BT_ISO_PHY_1M || qos->bcast.in.phy & BT_ISO_PHY_2M) { cp->scanning_phys |= LE_SCAN_PHY_1M; hci_le_scan_phy_params(phy, type, interval, window); num_phy++; phy++; } if (qos->bcast.in.phy & BT_ISO_PHY_CODED) { cp->scanning_phys |= LE_SCAN_PHY_CODED; hci_le_scan_phy_params(phy, type, interval * 3, window * 3); num_phy++; phy++; } if (num_phy) goto done; } } } if (scan_1m(hdev) || scan_2m(hdev)) { cp->scanning_phys |= LE_SCAN_PHY_1M; hci_le_scan_phy_params(phy, type, interval, window); num_phy++; phy++; } if (scan_coded(hdev)) { cp->scanning_phys |= LE_SCAN_PHY_CODED; hci_le_scan_phy_params(phy, type, interval * 3, window * 3); num_phy++; phy++; } done: if (!num_phy) return -EINVAL; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_PARAMS, sizeof(*cp) + sizeof(*phy) * num_phy, data, HCI_CMD_TIMEOUT); } static int hci_le_set_scan_param_sync(struct hci_dev *hdev, u8 type, u16 interval, u16 window, u8 own_addr_type, u8 filter_policy) { struct hci_cp_le_set_scan_param cp; if (use_ext_scan(hdev)) return hci_le_set_ext_scan_param_sync(hdev, type, interval, window, own_addr_type, filter_policy); memset(&cp, 0, sizeof(cp)); cp.type = type; cp.interval = cpu_to_le16(interval); cp.window = cpu_to_le16(window); cp.own_address_type = own_addr_type; cp.filter_policy = filter_policy; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_SCAN_PARAM, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_start_scan_sync(struct hci_dev *hdev, u8 type, u16 interval, u16 window, u8 own_addr_type, u8 filter_policy, u8 filter_dup) { int err; if (hdev->scanning_paused) { bt_dev_dbg(hdev, "Scanning is paused for suspend"); return 0; } err = hci_le_set_scan_param_sync(hdev, type, interval, window, own_addr_type, filter_policy); if (err) return err; return hci_le_set_scan_enable_sync(hdev, LE_SCAN_ENABLE, filter_dup); } static int hci_passive_scan_sync(struct hci_dev *hdev) { u8 own_addr_type; u8 filter_policy; u16 window, interval; u8 filter_dups = LE_SCAN_FILTER_DUP_ENABLE; int err; if (hdev->scanning_paused) { bt_dev_dbg(hdev, "Scanning is paused for suspend"); return 0; } err = hci_scan_disable_sync(hdev); if (err) { bt_dev_err(hdev, "disable scanning failed: %d", err); return err; } /* Set require_privacy to false since no SCAN_REQ are send * during passive scanning. Not using an non-resolvable address * here is important so that peer devices using direct * advertising with our address will be correctly reported * by the controller. */ if (hci_update_random_address_sync(hdev, false, scan_use_rpa(hdev), &own_addr_type)) return 0; if (hdev->enable_advmon_interleave_scan && hci_update_interleaved_scan_sync(hdev)) return 0; bt_dev_dbg(hdev, "interleave state %d", hdev->interleave_scan_state); /* Adding or removing entries from the accept list must * happen before enabling scanning. The controller does * not allow accept list modification while scanning. */ filter_policy = hci_update_accept_list_sync(hdev); /* If suspended and filter_policy set to 0x00 (no acceptlist) then * passive scanning cannot be started since that would require the host * to be woken up to process the reports. */ if (hdev->suspended && !filter_policy) { /* Check if accept list is empty then there is no need to scan * while suspended. */ if (list_empty(&hdev->le_accept_list)) return 0; /* If there are devices is the accept_list that means some * devices could not be programmed which in non-suspended case * means filter_policy needs to be set to 0x00 so the host needs * to filter, but since this is treating suspended case we * can ignore device needing host to filter to allow devices in * the acceptlist to be able to wakeup the system. */ filter_policy = 0x01; } /* When the controller is using random resolvable addresses and * with that having LE privacy enabled, then controllers with * Extended Scanner Filter Policies support can now enable support * for handling directed advertising. * * So instead of using filter polices 0x00 (no acceptlist) * and 0x01 (acceptlist enabled) use the new filter policies * 0x02 (no acceptlist) and 0x03 (acceptlist enabled). */ if (hci_dev_test_flag(hdev, HCI_PRIVACY) && (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY)) filter_policy |= 0x02; if (hdev->suspended) { window = hdev->le_scan_window_suspend; interval = hdev->le_scan_int_suspend; } else if (hci_is_le_conn_scanning(hdev)) { window = hdev->le_scan_window_connect; interval = hdev->le_scan_int_connect; } else if (hci_is_adv_monitoring(hdev)) { window = hdev->le_scan_window_adv_monitor; interval = hdev->le_scan_int_adv_monitor; /* Disable duplicates filter when scanning for advertisement * monitor for the following reasons. * * For HW pattern filtering (ex. MSFT), Realtek and Qualcomm * controllers ignore RSSI_Sampling_Period when the duplicates * filter is enabled. * * For SW pattern filtering, when we're not doing interleaved * scanning, it is necessary to disable duplicates filter, * otherwise hosts can only receive one advertisement and it's * impossible to know if a peer is still in range. */ filter_dups = LE_SCAN_FILTER_DUP_DISABLE; } else { window = hdev->le_scan_window; interval = hdev->le_scan_interval; } /* Disable all filtering for Mesh */ if (hci_dev_test_flag(hdev, HCI_MESH)) { filter_policy = 0; filter_dups = LE_SCAN_FILTER_DUP_DISABLE; } bt_dev_dbg(hdev, "LE passive scan with acceptlist = %d", filter_policy); return hci_start_scan_sync(hdev, LE_SCAN_PASSIVE, interval, window, own_addr_type, filter_policy, filter_dups); } /* This function controls the passive scanning based on hdev->pend_le_conns * list. If there are pending LE connection we start the background scanning, * otherwise we stop it in the following sequence: * * If there are devices to scan: * * Disable Scanning -> Update Accept List -> * use_ll_privacy((Disable Advertising) -> Disable Resolving List -> * Update Resolving List -> Enable Resolving List -> (Enable Advertising)) -> * Enable Scanning * * Otherwise: * * Disable Scanning */ int hci_update_passive_scan_sync(struct hci_dev *hdev) { int err; if (!test_bit(HCI_UP, &hdev->flags) || test_bit(HCI_INIT, &hdev->flags) || hci_dev_test_flag(hdev, HCI_SETUP) || hci_dev_test_flag(hdev, HCI_CONFIG) || hci_dev_test_flag(hdev, HCI_AUTO_OFF) || hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; /* No point in doing scanning if LE support hasn't been enabled */ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return 0; /* If discovery is active don't interfere with it */ if (hdev->discovery.state != DISCOVERY_STOPPED) return 0; /* Reset RSSI and UUID filters when starting background scanning * since these filters are meant for service discovery only. * * The Start Discovery and Start Service Discovery operations * ensure to set proper values for RSSI threshold and UUID * filter list. So it is safe to just reset them here. */ hci_discovery_filter_clear(hdev); bt_dev_dbg(hdev, "ADV monitoring is %s", hci_is_adv_monitoring(hdev) ? "on" : "off"); if (!hci_dev_test_flag(hdev, HCI_MESH) && list_empty(&hdev->pend_le_conns) && list_empty(&hdev->pend_le_reports) && !hci_is_adv_monitoring(hdev) && !hci_dev_test_flag(hdev, HCI_PA_SYNC)) { /* If there is no pending LE connections or devices * to be scanned for or no ADV monitors, we should stop the * background scanning. */ bt_dev_dbg(hdev, "stopping background scanning"); err = hci_scan_disable_sync(hdev); if (err) bt_dev_err(hdev, "stop background scanning failed: %d", err); } else { /* If there is at least one pending LE connection, we should * keep the background scan running. */ /* If controller is connecting, we should not start scanning * since some controllers are not able to scan and connect at * the same time. */ if (hci_lookup_le_connect(hdev)) return 0; bt_dev_dbg(hdev, "start background scanning"); err = hci_passive_scan_sync(hdev); if (err) bt_dev_err(hdev, "start background scanning failed: %d", err); } return err; } static int update_scan_sync(struct hci_dev *hdev, void *data) { return hci_update_scan_sync(hdev); } int hci_update_scan(struct hci_dev *hdev) { return hci_cmd_sync_queue(hdev, update_scan_sync, NULL, NULL); } static int update_passive_scan_sync(struct hci_dev *hdev, void *data) { return hci_update_passive_scan_sync(hdev); } int hci_update_passive_scan(struct hci_dev *hdev) { /* Only queue if it would have any effect */ if (!test_bit(HCI_UP, &hdev->flags) || test_bit(HCI_INIT, &hdev->flags) || hci_dev_test_flag(hdev, HCI_SETUP) || hci_dev_test_flag(hdev, HCI_CONFIG) || hci_dev_test_flag(hdev, HCI_AUTO_OFF) || hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; return hci_cmd_sync_queue_once(hdev, update_passive_scan_sync, NULL, NULL); } int hci_write_sc_support_sync(struct hci_dev *hdev, u8 val) { int err; if (!bredr_sc_enabled(hdev) || lmp_host_sc_capable(hdev)) return 0; err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SC_SUPPORT, sizeof(val), &val, HCI_CMD_TIMEOUT); if (!err) { if (val) { hdev->features[1][0] |= LMP_HOST_SC; hci_dev_set_flag(hdev, HCI_SC_ENABLED); } else { hdev->features[1][0] &= ~LMP_HOST_SC; hci_dev_clear_flag(hdev, HCI_SC_ENABLED); } } return err; } int hci_write_ssp_mode_sync(struct hci_dev *hdev, u8 mode) { int err; if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED) || lmp_host_ssp_capable(hdev)) return 0; if (!mode && hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) { __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE, sizeof(mode), &mode, HCI_CMD_TIMEOUT); } err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SSP_MODE, sizeof(mode), &mode, HCI_CMD_TIMEOUT); if (err) return err; return hci_write_sc_support_sync(hdev, 0x01); } int hci_write_le_host_supported_sync(struct hci_dev *hdev, u8 le, u8 simul) { struct hci_cp_write_le_host_supported cp; if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) || !lmp_bredr_capable(hdev)) return 0; /* Check first if we already have the right host state * (host features set) */ if (le == lmp_host_le_capable(hdev) && simul == lmp_host_le_br_capable(hdev)) return 0; memset(&cp, 0, sizeof(cp)); cp.le = le; cp.simul = simul; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_powered_update_adv_sync(struct hci_dev *hdev) { struct adv_info *adv, *tmp; int err; if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return 0; /* If RPA Resolution has not been enable yet it means the * resolving list is empty and we should attempt to program the * local IRK in order to support using own_addr_type * ADDR_LE_DEV_RANDOM_RESOLVED (0x03). */ if (!hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) { hci_le_add_resolve_list_sync(hdev, NULL); hci_le_set_addr_resolution_enable_sync(hdev, 0x01); } /* Make sure the controller has a good default for * advertising data. This also applies to the case * where BR/EDR was toggled during the AUTO_OFF phase. */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || list_empty(&hdev->adv_instances)) { if (ext_adv_capable(hdev)) { err = hci_setup_ext_adv_instance_sync(hdev, 0x00); if (!err) hci_update_scan_rsp_data_sync(hdev, 0x00); } else { err = hci_update_adv_data_sync(hdev, 0x00); if (!err) hci_update_scan_rsp_data_sync(hdev, 0x00); } if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) hci_enable_advertising_sync(hdev); } /* Call for each tracked instance to be scheduled */ list_for_each_entry_safe(adv, tmp, &hdev->adv_instances, list) hci_schedule_adv_instance_sync(hdev, adv->instance, true); return 0; } static int hci_write_auth_enable_sync(struct hci_dev *hdev) { u8 link_sec; link_sec = hci_dev_test_flag(hdev, HCI_LINK_SECURITY); if (link_sec == test_bit(HCI_AUTH, &hdev->flags)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_AUTH_ENABLE, sizeof(link_sec), &link_sec, HCI_CMD_TIMEOUT); } int hci_write_fast_connectable_sync(struct hci_dev *hdev, bool enable) { struct hci_cp_write_page_scan_activity cp; u8 type; int err = 0; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; if (hdev->hci_ver < BLUETOOTH_VER_1_2) return 0; memset(&cp, 0, sizeof(cp)); if (enable) { type = PAGE_SCAN_TYPE_INTERLACED; /* 160 msec page scan interval */ cp.interval = cpu_to_le16(0x0100); } else { type = hdev->def_page_scan_type; cp.interval = cpu_to_le16(hdev->def_page_scan_int); } cp.window = cpu_to_le16(hdev->def_page_scan_window); if (__cpu_to_le16(hdev->page_scan_interval) != cp.interval || __cpu_to_le16(hdev->page_scan_window) != cp.window) { err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_PAGE_SCAN_ACTIVITY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (err) return err; } if (hdev->page_scan_type != type) err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_PAGE_SCAN_TYPE, sizeof(type), &type, HCI_CMD_TIMEOUT); return err; } static bool disconnected_accept_list_entries(struct hci_dev *hdev) { struct bdaddr_list *b; list_for_each_entry(b, &hdev->accept_list, list) { struct hci_conn *conn; conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &b->bdaddr); if (!conn) return true; if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG) return true; } return false; } static int hci_write_scan_enable_sync(struct hci_dev *hdev, u8 val) { return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SCAN_ENABLE, sizeof(val), &val, HCI_CMD_TIMEOUT); } int hci_update_scan_sync(struct hci_dev *hdev) { u8 scan; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; if (!hdev_is_powered(hdev)) return 0; if (mgmt_powering_down(hdev)) return 0; if (hdev->scanning_paused) return 0; if (hci_dev_test_flag(hdev, HCI_CONNECTABLE) || disconnected_accept_list_entries(hdev)) scan = SCAN_PAGE; else scan = SCAN_DISABLED; if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) scan |= SCAN_INQUIRY; if (test_bit(HCI_PSCAN, &hdev->flags) == !!(scan & SCAN_PAGE) && test_bit(HCI_ISCAN, &hdev->flags) == !!(scan & SCAN_INQUIRY)) return 0; return hci_write_scan_enable_sync(hdev, scan); } int hci_update_name_sync(struct hci_dev *hdev) { struct hci_cp_write_local_name cp; memset(&cp, 0, sizeof(cp)); memcpy(cp.name, hdev->dev_name, sizeof(cp.name)); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* This function perform powered update HCI command sequence after the HCI init * sequence which end up resetting all states, the sequence is as follows: * * HCI_SSP_ENABLED(Enable SSP) * HCI_LE_ENABLED(Enable LE) * HCI_LE_ENABLED(use_ll_privacy(Add local IRK to Resolving List) -> * Update adv data) * Enable Authentication * lmp_bredr_capable(Set Fast Connectable -> Set Scan Type -> Set Class -> * Set Name -> Set EIR) * HCI_FORCE_STATIC_ADDR | BDADDR_ANY && !HCI_BREDR_ENABLED (Set Static Address) */ int hci_powered_update_sync(struct hci_dev *hdev) { int err; /* Register the available SMP channels (BR/EDR and LE) only when * successfully powering on the controller. This late * registration is required so that LE SMP can clearly decide if * the public address or static address is used. */ smp_register(hdev); err = hci_write_ssp_mode_sync(hdev, 0x01); if (err) return err; err = hci_write_le_host_supported_sync(hdev, 0x01, 0x00); if (err) return err; err = hci_powered_update_adv_sync(hdev); if (err) return err; err = hci_write_auth_enable_sync(hdev); if (err) return err; if (lmp_bredr_capable(hdev)) { if (hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE)) hci_write_fast_connectable_sync(hdev, true); else hci_write_fast_connectable_sync(hdev, false); hci_update_scan_sync(hdev); hci_update_class_sync(hdev); hci_update_name_sync(hdev); hci_update_eir_sync(hdev); } /* If forcing static address is in use or there is no public * address use the static address as random address (but skip * the HCI command if the current random address is already the * static one. * * In case BR/EDR has been disabled on a dual-mode controller * and a static address has been configured, then use that * address instead of the public BR/EDR address. */ if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || (!bacmp(&hdev->bdaddr, BDADDR_ANY) && !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))) { if (bacmp(&hdev->static_addr, BDADDR_ANY)) return hci_set_random_addr_sync(hdev, &hdev->static_addr); } return 0; } /** * hci_dev_get_bd_addr_from_property - Get the Bluetooth Device Address * (BD_ADDR) for a HCI device from * a firmware node property. * @hdev: The HCI device * * Search the firmware node for 'local-bd-address'. * * All-zero BD addresses are rejected, because those could be properties * that exist in the firmware tables, but were not updated by the firmware. For * example, the DTS could define 'local-bd-address', with zero BD addresses. */ static void hci_dev_get_bd_addr_from_property(struct hci_dev *hdev) { struct fwnode_handle *fwnode = dev_fwnode(hdev->dev.parent); bdaddr_t ba; int ret; ret = fwnode_property_read_u8_array(fwnode, "local-bd-address", (u8 *)&ba, sizeof(ba)); if (ret < 0 || !bacmp(&ba, BDADDR_ANY)) return; if (test_bit(HCI_QUIRK_BDADDR_PROPERTY_BROKEN, &hdev->quirks)) baswap(&hdev->public_addr, &ba); else bacpy(&hdev->public_addr, &ba); } struct hci_init_stage { int (*func)(struct hci_dev *hdev); }; /* Run init stage NULL terminated function table */ static int hci_init_stage_sync(struct hci_dev *hdev, const struct hci_init_stage *stage) { size_t i; for (i = 0; stage[i].func; i++) { int err; err = stage[i].func(hdev); if (err) return err; } return 0; } /* Read Local Version */ static int hci_read_local_version_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_VERSION, 0, NULL, HCI_CMD_TIMEOUT); } /* Read BD Address */ static int hci_read_bd_addr_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_BD_ADDR, 0, NULL, HCI_CMD_TIMEOUT); } #define HCI_INIT(_func) \ { \ .func = _func, \ } static const struct hci_init_stage hci_init0[] = { /* HCI_OP_READ_LOCAL_VERSION */ HCI_INIT(hci_read_local_version_sync), /* HCI_OP_READ_BD_ADDR */ HCI_INIT(hci_read_bd_addr_sync), {} }; int hci_reset_sync(struct hci_dev *hdev) { int err; set_bit(HCI_RESET, &hdev->flags); err = __hci_cmd_sync_status(hdev, HCI_OP_RESET, 0, NULL, HCI_CMD_TIMEOUT); if (err) return err; return 0; } static int hci_init0_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); /* Reset */ if (!test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) { err = hci_reset_sync(hdev); if (err) return err; } return hci_init_stage_sync(hdev, hci_init0); } static int hci_unconf_init_sync(struct hci_dev *hdev) { int err; if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) return 0; err = hci_init0_sync(hdev); if (err < 0) return err; if (hci_dev_test_flag(hdev, HCI_SETUP)) hci_debugfs_create_basic(hdev); return 0; } /* Read Local Supported Features. */ static int hci_read_local_features_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_FEATURES, 0, NULL, HCI_CMD_TIMEOUT); } /* BR Controller init stage 1 command sequence */ static const struct hci_init_stage br_init1[] = { /* HCI_OP_READ_LOCAL_FEATURES */ HCI_INIT(hci_read_local_features_sync), /* HCI_OP_READ_LOCAL_VERSION */ HCI_INIT(hci_read_local_version_sync), /* HCI_OP_READ_BD_ADDR */ HCI_INIT(hci_read_bd_addr_sync), {} }; /* Read Local Commands */ static int hci_read_local_cmds_sync(struct hci_dev *hdev) { /* All Bluetooth 1.2 and later controllers should support the * HCI command for reading the local supported commands. * * Unfortunately some controllers indicate Bluetooth 1.2 support, * but do not have support for this command. If that is the case, * the driver can quirk the behavior and skip reading the local * supported commands. */ if (hdev->hci_ver > BLUETOOTH_VER_1_1 && !test_bit(HCI_QUIRK_BROKEN_LOCAL_COMMANDS, &hdev->quirks)) return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_COMMANDS, 0, NULL, HCI_CMD_TIMEOUT); return 0; } static int hci_init1_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); /* Reset */ if (!test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) { err = hci_reset_sync(hdev); if (err) return err; } return hci_init_stage_sync(hdev, br_init1); } /* Read Buffer Size (ACL mtu, max pkt, etc.) */ static int hci_read_buffer_size_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_BUFFER_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } /* Read Class of Device */ static int hci_read_dev_class_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_CLASS_OF_DEV, 0, NULL, HCI_CMD_TIMEOUT); } /* Read Local Name */ static int hci_read_local_name_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_NAME, 0, NULL, HCI_CMD_TIMEOUT); } /* Read Voice Setting */ static int hci_read_voice_setting_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_VOICE_SETTING, 0, NULL, HCI_CMD_TIMEOUT); } /* Read Number of Supported IAC */ static int hci_read_num_supported_iac_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_NUM_SUPPORTED_IAC, 0, NULL, HCI_CMD_TIMEOUT); } /* Read Current IAC LAP */ static int hci_read_current_iac_lap_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_CURRENT_IAC_LAP, 0, NULL, HCI_CMD_TIMEOUT); } static int hci_set_event_filter_sync(struct hci_dev *hdev, u8 flt_type, u8 cond_type, bdaddr_t *bdaddr, u8 auto_accept) { struct hci_cp_set_event_filter cp; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks)) return 0; memset(&cp, 0, sizeof(cp)); cp.flt_type = flt_type; if (flt_type != HCI_FLT_CLEAR_ALL) { cp.cond_type = cond_type; bacpy(&cp.addr_conn_flt.bdaddr, bdaddr); cp.addr_conn_flt.auto_accept = auto_accept; } return __hci_cmd_sync_status(hdev, HCI_OP_SET_EVENT_FLT, flt_type == HCI_FLT_CLEAR_ALL ? sizeof(cp.flt_type) : sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_clear_event_filter_sync(struct hci_dev *hdev) { if (!hci_dev_test_flag(hdev, HCI_EVENT_FILTER_CONFIGURED)) return 0; /* In theory the state machine should not reach here unless * a hci_set_event_filter_sync() call succeeds, but we do * the check both for parity and as a future reminder. */ if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks)) return 0; return hci_set_event_filter_sync(hdev, HCI_FLT_CLEAR_ALL, 0x00, BDADDR_ANY, 0x00); } /* Connection accept timeout ~20 secs */ static int hci_write_ca_timeout_sync(struct hci_dev *hdev) { __le16 param = cpu_to_le16(0x7d00); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_CA_TIMEOUT, sizeof(param), &param, HCI_CMD_TIMEOUT); } /* BR Controller init stage 2 command sequence */ static const struct hci_init_stage br_init2[] = { /* HCI_OP_READ_BUFFER_SIZE */ HCI_INIT(hci_read_buffer_size_sync), /* HCI_OP_READ_CLASS_OF_DEV */ HCI_INIT(hci_read_dev_class_sync), /* HCI_OP_READ_LOCAL_NAME */ HCI_INIT(hci_read_local_name_sync), /* HCI_OP_READ_VOICE_SETTING */ HCI_INIT(hci_read_voice_setting_sync), /* HCI_OP_READ_NUM_SUPPORTED_IAC */ HCI_INIT(hci_read_num_supported_iac_sync), /* HCI_OP_READ_CURRENT_IAC_LAP */ HCI_INIT(hci_read_current_iac_lap_sync), /* HCI_OP_SET_EVENT_FLT */ HCI_INIT(hci_clear_event_filter_sync), /* HCI_OP_WRITE_CA_TIMEOUT */ HCI_INIT(hci_write_ca_timeout_sync), {} }; static int hci_write_ssp_mode_1_sync(struct hci_dev *hdev) { u8 mode = 0x01; if (!lmp_ssp_capable(hdev) || !hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) return 0; /* When SSP is available, then the host features page * should also be available as well. However some * controllers list the max_page as 0 as long as SSP * has not been enabled. To achieve proper debugging * output, force the minimum max_page to 1 at least. */ hdev->max_page = 0x01; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SSP_MODE, sizeof(mode), &mode, HCI_CMD_TIMEOUT); } static int hci_write_eir_sync(struct hci_dev *hdev) { struct hci_cp_write_eir cp; if (!lmp_ssp_capable(hdev) || hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) return 0; memset(hdev->eir, 0, sizeof(hdev->eir)); memset(&cp, 0, sizeof(cp)); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_EIR, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_write_inquiry_mode_sync(struct hci_dev *hdev) { u8 mode; if (!lmp_inq_rssi_capable(hdev) && !test_bit(HCI_QUIRK_FIXUP_INQUIRY_MODE, &hdev->quirks)) return 0; /* If Extended Inquiry Result events are supported, then * they are clearly preferred over Inquiry Result with RSSI * events. */ mode = lmp_ext_inq_capable(hdev) ? 0x02 : 0x01; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_INQUIRY_MODE, sizeof(mode), &mode, HCI_CMD_TIMEOUT); } static int hci_read_inq_rsp_tx_power_sync(struct hci_dev *hdev) { if (!lmp_inq_tx_pwr_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_INQ_RSP_TX_POWER, 0, NULL, HCI_CMD_TIMEOUT); } static int hci_read_local_ext_features_sync(struct hci_dev *hdev, u8 page) { struct hci_cp_read_local_ext_features cp; if (!lmp_ext_feat_capable(hdev)) return 0; memset(&cp, 0, sizeof(cp)); cp.page = page; return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_EXT_FEATURES, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_read_local_ext_features_1_sync(struct hci_dev *hdev) { return hci_read_local_ext_features_sync(hdev, 0x01); } /* HCI Controller init stage 2 command sequence */ static const struct hci_init_stage hci_init2[] = { /* HCI_OP_READ_LOCAL_COMMANDS */ HCI_INIT(hci_read_local_cmds_sync), /* HCI_OP_WRITE_SSP_MODE */ HCI_INIT(hci_write_ssp_mode_1_sync), /* HCI_OP_WRITE_EIR */ HCI_INIT(hci_write_eir_sync), /* HCI_OP_WRITE_INQUIRY_MODE */ HCI_INIT(hci_write_inquiry_mode_sync), /* HCI_OP_READ_INQ_RSP_TX_POWER */ HCI_INIT(hci_read_inq_rsp_tx_power_sync), /* HCI_OP_READ_LOCAL_EXT_FEATURES */ HCI_INIT(hci_read_local_ext_features_1_sync), /* HCI_OP_WRITE_AUTH_ENABLE */ HCI_INIT(hci_write_auth_enable_sync), {} }; /* Read LE Buffer Size */ static int hci_le_read_buffer_size_sync(struct hci_dev *hdev) { /* Use Read LE Buffer Size V2 if supported */ if (iso_capable(hdev) && hdev->commands[41] & 0x20) return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_BUFFER_SIZE_V2, 0, NULL, HCI_CMD_TIMEOUT); return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_BUFFER_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Local Supported Features */ static int hci_le_read_local_features_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_LOCAL_FEATURES, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Supported States */ static int hci_le_read_supported_states_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_SUPPORTED_STATES, 0, NULL, HCI_CMD_TIMEOUT); } /* LE Controller init stage 2 command sequence */ static const struct hci_init_stage le_init2[] = { /* HCI_OP_LE_READ_LOCAL_FEATURES */ HCI_INIT(hci_le_read_local_features_sync), /* HCI_OP_LE_READ_BUFFER_SIZE */ HCI_INIT(hci_le_read_buffer_size_sync), /* HCI_OP_LE_READ_SUPPORTED_STATES */ HCI_INIT(hci_le_read_supported_states_sync), {} }; static int hci_init2_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); err = hci_init_stage_sync(hdev, hci_init2); if (err) return err; if (lmp_bredr_capable(hdev)) { err = hci_init_stage_sync(hdev, br_init2); if (err) return err; } else { hci_dev_clear_flag(hdev, HCI_BREDR_ENABLED); } if (lmp_le_capable(hdev)) { err = hci_init_stage_sync(hdev, le_init2); if (err) return err; /* LE-only controllers have LE implicitly enabled */ if (!lmp_bredr_capable(hdev)) hci_dev_set_flag(hdev, HCI_LE_ENABLED); } return 0; } static int hci_set_event_mask_sync(struct hci_dev *hdev) { /* The second byte is 0xff instead of 0x9f (two reserved bits * disabled) since a Broadcom 1.2 dongle doesn't respond to the * command otherwise. */ u8 events[8] = { 0xff, 0xff, 0xfb, 0xff, 0x00, 0x00, 0x00, 0x00 }; /* CSR 1.1 dongles does not accept any bitfield so don't try to set * any event mask for pre 1.2 devices. */ if (hdev->hci_ver < BLUETOOTH_VER_1_2) return 0; if (lmp_bredr_capable(hdev)) { events[4] |= 0x01; /* Flow Specification Complete */ /* Don't set Disconnect Complete and mode change when * suspended as that would wakeup the host when disconnecting * due to suspend. */ if (hdev->suspended) { events[0] &= 0xef; events[2] &= 0xf7; } } else { /* Use a different default for LE-only devices */ memset(events, 0, sizeof(events)); events[1] |= 0x20; /* Command Complete */ events[1] |= 0x40; /* Command Status */ events[1] |= 0x80; /* Hardware Error */ /* If the controller supports the Disconnect command, enable * the corresponding event. In addition enable packet flow * control related events. */ if (hdev->commands[0] & 0x20) { /* Don't set Disconnect Complete when suspended as that * would wakeup the host when disconnecting due to * suspend. */ if (!hdev->suspended) events[0] |= 0x10; /* Disconnection Complete */ events[2] |= 0x04; /* Number of Completed Packets */ events[3] |= 0x02; /* Data Buffer Overflow */ } /* If the controller supports the Read Remote Version * Information command, enable the corresponding event. */ if (hdev->commands[2] & 0x80) events[1] |= 0x08; /* Read Remote Version Information * Complete */ if (hdev->le_features[0] & HCI_LE_ENCRYPTION) { events[0] |= 0x80; /* Encryption Change */ events[5] |= 0x80; /* Encryption Key Refresh Complete */ } } if (lmp_inq_rssi_capable(hdev) || test_bit(HCI_QUIRK_FIXUP_INQUIRY_MODE, &hdev->quirks)) events[4] |= 0x02; /* Inquiry Result with RSSI */ if (lmp_ext_feat_capable(hdev)) events[4] |= 0x04; /* Read Remote Extended Features Complete */ if (lmp_esco_capable(hdev)) { events[5] |= 0x08; /* Synchronous Connection Complete */ events[5] |= 0x10; /* Synchronous Connection Changed */ } if (lmp_sniffsubr_capable(hdev)) events[5] |= 0x20; /* Sniff Subrating */ if (lmp_pause_enc_capable(hdev)) events[5] |= 0x80; /* Encryption Key Refresh Complete */ if (lmp_ext_inq_capable(hdev)) events[5] |= 0x40; /* Extended Inquiry Result */ if (lmp_no_flush_capable(hdev)) events[7] |= 0x01; /* Enhanced Flush Complete */ if (lmp_lsto_capable(hdev)) events[6] |= 0x80; /* Link Supervision Timeout Changed */ if (lmp_ssp_capable(hdev)) { events[6] |= 0x01; /* IO Capability Request */ events[6] |= 0x02; /* IO Capability Response */ events[6] |= 0x04; /* User Confirmation Request */ events[6] |= 0x08; /* User Passkey Request */ events[6] |= 0x10; /* Remote OOB Data Request */ events[6] |= 0x20; /* Simple Pairing Complete */ events[7] |= 0x04; /* User Passkey Notification */ events[7] |= 0x08; /* Keypress Notification */ events[7] |= 0x10; /* Remote Host Supported * Features Notification */ } if (lmp_le_capable(hdev)) events[7] |= 0x20; /* LE Meta-Event */ return __hci_cmd_sync_status(hdev, HCI_OP_SET_EVENT_MASK, sizeof(events), events, HCI_CMD_TIMEOUT); } static int hci_read_stored_link_key_sync(struct hci_dev *hdev) { struct hci_cp_read_stored_link_key cp; if (!(hdev->commands[6] & 0x20) || test_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks)) return 0; memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, BDADDR_ANY); cp.read_all = 0x01; return __hci_cmd_sync_status(hdev, HCI_OP_READ_STORED_LINK_KEY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_setup_link_policy_sync(struct hci_dev *hdev) { struct hci_cp_write_def_link_policy cp; u16 link_policy = 0; if (!(hdev->commands[5] & 0x10)) return 0; memset(&cp, 0, sizeof(cp)); if (lmp_rswitch_capable(hdev)) link_policy |= HCI_LP_RSWITCH; if (lmp_hold_capable(hdev)) link_policy |= HCI_LP_HOLD; if (lmp_sniff_capable(hdev)) link_policy |= HCI_LP_SNIFF; if (lmp_park_capable(hdev)) link_policy |= HCI_LP_PARK; cp.policy = cpu_to_le16(link_policy); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_LINK_POLICY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_read_page_scan_activity_sync(struct hci_dev *hdev) { if (!(hdev->commands[8] & 0x01)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_PAGE_SCAN_ACTIVITY, 0, NULL, HCI_CMD_TIMEOUT); } static int hci_read_def_err_data_reporting_sync(struct hci_dev *hdev) { if (!(hdev->commands[18] & 0x04) || !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) || test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_DEF_ERR_DATA_REPORTING, 0, NULL, HCI_CMD_TIMEOUT); } static int hci_read_page_scan_type_sync(struct hci_dev *hdev) { /* Some older Broadcom based Bluetooth 1.2 controllers do not * support the Read Page Scan Type command. Check support for * this command in the bit mask of supported commands. */ if (!(hdev->commands[13] & 0x01)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_PAGE_SCAN_TYPE, 0, NULL, HCI_CMD_TIMEOUT); } /* Read features beyond page 1 if available */ static int hci_read_local_ext_features_all_sync(struct hci_dev *hdev) { u8 page; int err; if (!lmp_ext_feat_capable(hdev)) return 0; for (page = 2; page < HCI_MAX_PAGES && page <= hdev->max_page; page++) { err = hci_read_local_ext_features_sync(hdev, page); if (err) return err; } return 0; } /* HCI Controller init stage 3 command sequence */ static const struct hci_init_stage hci_init3[] = { /* HCI_OP_SET_EVENT_MASK */ HCI_INIT(hci_set_event_mask_sync), /* HCI_OP_READ_STORED_LINK_KEY */ HCI_INIT(hci_read_stored_link_key_sync), /* HCI_OP_WRITE_DEF_LINK_POLICY */ HCI_INIT(hci_setup_link_policy_sync), /* HCI_OP_READ_PAGE_SCAN_ACTIVITY */ HCI_INIT(hci_read_page_scan_activity_sync), /* HCI_OP_READ_DEF_ERR_DATA_REPORTING */ HCI_INIT(hci_read_def_err_data_reporting_sync), /* HCI_OP_READ_PAGE_SCAN_TYPE */ HCI_INIT(hci_read_page_scan_type_sync), /* HCI_OP_READ_LOCAL_EXT_FEATURES */ HCI_INIT(hci_read_local_ext_features_all_sync), {} }; static int hci_le_set_event_mask_sync(struct hci_dev *hdev) { u8 events[8]; if (!lmp_le_capable(hdev)) return 0; memset(events, 0, sizeof(events)); if (hdev->le_features[0] & HCI_LE_ENCRYPTION) events[0] |= 0x10; /* LE Long Term Key Request */ /* If controller supports the Connection Parameters Request * Link Layer Procedure, enable the corresponding event. */ if (hdev->le_features[0] & HCI_LE_CONN_PARAM_REQ_PROC) /* LE Remote Connection Parameter Request */ events[0] |= 0x20; /* If the controller supports the Data Length Extension * feature, enable the corresponding event. */ if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) events[0] |= 0x40; /* LE Data Length Change */ /* If the controller supports LL Privacy feature or LE Extended Adv, * enable the corresponding event. */ if (use_enhanced_conn_complete(hdev)) events[1] |= 0x02; /* LE Enhanced Connection Complete */ /* If the controller supports Extended Scanner Filter * Policies, enable the corresponding event. */ if (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY) events[1] |= 0x04; /* LE Direct Advertising Report */ /* If the controller supports Channel Selection Algorithm #2 * feature, enable the corresponding event. */ if (hdev->le_features[1] & HCI_LE_CHAN_SEL_ALG2) events[2] |= 0x08; /* LE Channel Selection Algorithm */ /* If the controller supports the LE Set Scan Enable command, * enable the corresponding advertising report event. */ if (hdev->commands[26] & 0x08) events[0] |= 0x02; /* LE Advertising Report */ /* If the controller supports the LE Create Connection * command, enable the corresponding event. */ if (hdev->commands[26] & 0x10) events[0] |= 0x01; /* LE Connection Complete */ /* If the controller supports the LE Connection Update * command, enable the corresponding event. */ if (hdev->commands[27] & 0x04) events[0] |= 0x04; /* LE Connection Update Complete */ /* If the controller supports the LE Read Remote Used Features * command, enable the corresponding event. */ if (hdev->commands[27] & 0x20) /* LE Read Remote Used Features Complete */ events[0] |= 0x08; /* If the controller supports the LE Read Local P-256 * Public Key command, enable the corresponding event. */ if (hdev->commands[34] & 0x02) /* LE Read Local P-256 Public Key Complete */ events[0] |= 0x80; /* If the controller supports the LE Generate DHKey * command, enable the corresponding event. */ if (hdev->commands[34] & 0x04) events[1] |= 0x01; /* LE Generate DHKey Complete */ /* If the controller supports the LE Set Default PHY or * LE Set PHY commands, enable the corresponding event. */ if (hdev->commands[35] & (0x20 | 0x40)) events[1] |= 0x08; /* LE PHY Update Complete */ /* If the controller supports LE Set Extended Scan Parameters * and LE Set Extended Scan Enable commands, enable the * corresponding event. */ if (use_ext_scan(hdev)) events[1] |= 0x10; /* LE Extended Advertising Report */ /* If the controller supports the LE Extended Advertising * command, enable the corresponding event. */ if (ext_adv_capable(hdev)) events[2] |= 0x02; /* LE Advertising Set Terminated */ if (cis_capable(hdev)) { events[3] |= 0x01; /* LE CIS Established */ if (cis_peripheral_capable(hdev)) events[3] |= 0x02; /* LE CIS Request */ } if (bis_capable(hdev)) { events[1] |= 0x20; /* LE PA Report */ events[1] |= 0x40; /* LE PA Sync Established */ events[3] |= 0x04; /* LE Create BIG Complete */ events[3] |= 0x08; /* LE Terminate BIG Complete */ events[3] |= 0x10; /* LE BIG Sync Established */ events[3] |= 0x20; /* LE BIG Sync Loss */ events[4] |= 0x02; /* LE BIG Info Advertising Report */ } return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EVENT_MASK, sizeof(events), events, HCI_CMD_TIMEOUT); } /* Read LE Advertising Channel TX Power */ static int hci_le_read_adv_tx_power_sync(struct hci_dev *hdev) { if ((hdev->commands[25] & 0x40) && !ext_adv_capable(hdev)) { /* HCI TS spec forbids mixing of legacy and extended * advertising commands wherein READ_ADV_TX_POWER is * also included. So do not call it if extended adv * is supported otherwise controller will return * COMMAND_DISALLOWED for extended commands. */ return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL, HCI_CMD_TIMEOUT); } return 0; } /* Read LE Min/Max Tx Power*/ static int hci_le_read_tx_power_sync(struct hci_dev *hdev) { if (!(hdev->commands[38] & 0x80) || test_bit(HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER, &hdev->quirks)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_TRANSMIT_POWER, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Accept List Size */ static int hci_le_read_accept_list_size_sync(struct hci_dev *hdev) { if (!(hdev->commands[26] & 0x40)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_ACCEPT_LIST_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Resolving List Size */ static int hci_le_read_resolv_list_size_sync(struct hci_dev *hdev) { if (!(hdev->commands[34] & 0x40)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_RESOLV_LIST_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } /* Clear LE Resolving List */ static int hci_le_clear_resolv_list_sync(struct hci_dev *hdev) { if (!(hdev->commands[34] & 0x20)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_CLEAR_RESOLV_LIST, 0, NULL, HCI_CMD_TIMEOUT); } /* Set RPA timeout */ static int hci_le_set_rpa_timeout_sync(struct hci_dev *hdev) { __le16 timeout = cpu_to_le16(hdev->rpa_timeout); if (!(hdev->commands[35] & 0x04) || test_bit(HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, &hdev->quirks)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_RPA_TIMEOUT, sizeof(timeout), &timeout, HCI_CMD_TIMEOUT); } /* Read LE Maximum Data Length */ static int hci_le_read_max_data_len_sync(struct hci_dev *hdev) { if (!(hdev->le_features[0] & HCI_LE_DATA_LEN_EXT)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Suggested Default Data Length */ static int hci_le_read_def_data_len_sync(struct hci_dev *hdev) { if (!(hdev->le_features[0] & HCI_LE_DATA_LEN_EXT)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_DEF_DATA_LEN, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Number of Supported Advertising Sets */ static int hci_le_read_num_support_adv_sets_sync(struct hci_dev *hdev) { if (!ext_adv_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS, 0, NULL, HCI_CMD_TIMEOUT); } /* Write LE Host Supported */ static int hci_set_le_support_sync(struct hci_dev *hdev) { struct hci_cp_write_le_host_supported cp; /* LE-only devices do not support explicit enablement */ if (!lmp_bredr_capable(hdev)) return 0; memset(&cp, 0, sizeof(cp)); if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { cp.le = 0x01; cp.simul = 0x00; } if (cp.le == lmp_host_le_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* LE Set Host Feature */ static int hci_le_set_host_feature_sync(struct hci_dev *hdev) { struct hci_cp_le_set_host_feature cp; if (!cis_capable(hdev)) return 0; memset(&cp, 0, sizeof(cp)); /* Connected Isochronous Channels (Host Support) */ cp.bit_number = 32; cp.bit_value = 1; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_HOST_FEATURE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* LE Controller init stage 3 command sequence */ static const struct hci_init_stage le_init3[] = { /* HCI_OP_LE_SET_EVENT_MASK */ HCI_INIT(hci_le_set_event_mask_sync), /* HCI_OP_LE_READ_ADV_TX_POWER */ HCI_INIT(hci_le_read_adv_tx_power_sync), /* HCI_OP_LE_READ_TRANSMIT_POWER */ HCI_INIT(hci_le_read_tx_power_sync), /* HCI_OP_LE_READ_ACCEPT_LIST_SIZE */ HCI_INIT(hci_le_read_accept_list_size_sync), /* HCI_OP_LE_CLEAR_ACCEPT_LIST */ HCI_INIT(hci_le_clear_accept_list_sync), /* HCI_OP_LE_READ_RESOLV_LIST_SIZE */ HCI_INIT(hci_le_read_resolv_list_size_sync), /* HCI_OP_LE_CLEAR_RESOLV_LIST */ HCI_INIT(hci_le_clear_resolv_list_sync), /* HCI_OP_LE_SET_RPA_TIMEOUT */ HCI_INIT(hci_le_set_rpa_timeout_sync), /* HCI_OP_LE_READ_MAX_DATA_LEN */ HCI_INIT(hci_le_read_max_data_len_sync), /* HCI_OP_LE_READ_DEF_DATA_LEN */ HCI_INIT(hci_le_read_def_data_len_sync), /* HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS */ HCI_INIT(hci_le_read_num_support_adv_sets_sync), /* HCI_OP_WRITE_LE_HOST_SUPPORTED */ HCI_INIT(hci_set_le_support_sync), /* HCI_OP_LE_SET_HOST_FEATURE */ HCI_INIT(hci_le_set_host_feature_sync), {} }; static int hci_init3_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); err = hci_init_stage_sync(hdev, hci_init3); if (err) return err; if (lmp_le_capable(hdev)) return hci_init_stage_sync(hdev, le_init3); return 0; } static int hci_delete_stored_link_key_sync(struct hci_dev *hdev) { struct hci_cp_delete_stored_link_key cp; /* Some Broadcom based Bluetooth controllers do not support the * Delete Stored Link Key command. They are clearly indicating its * absence in the bit mask of supported commands. * * Check the supported commands and only if the command is marked * as supported send it. If not supported assume that the controller * does not have actual support for stored link keys which makes this * command redundant anyway. * * Some controllers indicate that they support handling deleting * stored link keys, but they don't. The quirk lets a driver * just disable this command. */ if (!(hdev->commands[6] & 0x80) || test_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks)) return 0; memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, BDADDR_ANY); cp.delete_all = 0x01; return __hci_cmd_sync_status(hdev, HCI_OP_DELETE_STORED_LINK_KEY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_set_event_mask_page_2_sync(struct hci_dev *hdev) { u8 events[8] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; bool changed = false; /* Set event mask page 2 if the HCI command for it is supported */ if (!(hdev->commands[22] & 0x04)) return 0; /* If Connectionless Peripheral Broadcast central role is supported * enable all necessary events for it. */ if (lmp_cpb_central_capable(hdev)) { events[1] |= 0x40; /* Triggered Clock Capture */ events[1] |= 0x80; /* Synchronization Train Complete */ events[2] |= 0x08; /* Truncated Page Complete */ events[2] |= 0x20; /* CPB Channel Map Change */ changed = true; } /* If Connectionless Peripheral Broadcast peripheral role is supported * enable all necessary events for it. */ if (lmp_cpb_peripheral_capable(hdev)) { events[2] |= 0x01; /* Synchronization Train Received */ events[2] |= 0x02; /* CPB Receive */ events[2] |= 0x04; /* CPB Timeout */ events[2] |= 0x10; /* Peripheral Page Response Timeout */ changed = true; } /* Enable Authenticated Payload Timeout Expired event if supported */ if (lmp_ping_capable(hdev) || hdev->le_features[0] & HCI_LE_PING) { events[2] |= 0x80; changed = true; } /* Some Broadcom based controllers indicate support for Set Event * Mask Page 2 command, but then actually do not support it. Since * the default value is all bits set to zero, the command is only * required if the event mask has to be changed. In case no change * to the event mask is needed, skip this command. */ if (!changed) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_SET_EVENT_MASK_PAGE_2, sizeof(events), events, HCI_CMD_TIMEOUT); } /* Read local codec list if the HCI command is supported */ static int hci_read_local_codecs_sync(struct hci_dev *hdev) { if (hdev->commands[45] & 0x04) hci_read_supported_codecs_v2(hdev); else if (hdev->commands[29] & 0x20) hci_read_supported_codecs(hdev); return 0; } /* Read local pairing options if the HCI command is supported */ static int hci_read_local_pairing_opts_sync(struct hci_dev *hdev) { if (!(hdev->commands[41] & 0x08)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_PAIRING_OPTS, 0, NULL, HCI_CMD_TIMEOUT); } /* Get MWS transport configuration if the HCI command is supported */ static int hci_get_mws_transport_config_sync(struct hci_dev *hdev) { if (!mws_transport_config_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_GET_MWS_TRANSPORT_CONFIG, 0, NULL, HCI_CMD_TIMEOUT); } /* Check for Synchronization Train support */ static int hci_read_sync_train_params_sync(struct hci_dev *hdev) { if (!lmp_sync_train_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_SYNC_TRAIN_PARAMS, 0, NULL, HCI_CMD_TIMEOUT); } /* Enable Secure Connections if supported and configured */ static int hci_write_sc_support_1_sync(struct hci_dev *hdev) { u8 support = 0x01; if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED) || !bredr_sc_enabled(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SC_SUPPORT, sizeof(support), &support, HCI_CMD_TIMEOUT); } /* Set erroneous data reporting if supported to the wideband speech * setting value */ static int hci_set_err_data_report_sync(struct hci_dev *hdev) { struct hci_cp_write_def_err_data_reporting cp; bool enabled = hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED); if (!(hdev->commands[18] & 0x08) || !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) || test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks)) return 0; if (enabled == hdev->err_data_reporting) return 0; memset(&cp, 0, sizeof(cp)); cp.err_data_reporting = enabled ? ERR_DATA_REPORTING_ENABLED : ERR_DATA_REPORTING_DISABLED; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_ERR_DATA_REPORTING, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static const struct hci_init_stage hci_init4[] = { /* HCI_OP_DELETE_STORED_LINK_KEY */ HCI_INIT(hci_delete_stored_link_key_sync), /* HCI_OP_SET_EVENT_MASK_PAGE_2 */ HCI_INIT(hci_set_event_mask_page_2_sync), /* HCI_OP_READ_LOCAL_CODECS */ HCI_INIT(hci_read_local_codecs_sync), /* HCI_OP_READ_LOCAL_PAIRING_OPTS */ HCI_INIT(hci_read_local_pairing_opts_sync), /* HCI_OP_GET_MWS_TRANSPORT_CONFIG */ HCI_INIT(hci_get_mws_transport_config_sync), /* HCI_OP_READ_SYNC_TRAIN_PARAMS */ HCI_INIT(hci_read_sync_train_params_sync), /* HCI_OP_WRITE_SC_SUPPORT */ HCI_INIT(hci_write_sc_support_1_sync), /* HCI_OP_WRITE_DEF_ERR_DATA_REPORTING */ HCI_INIT(hci_set_err_data_report_sync), {} }; /* Set Suggested Default Data Length to maximum if supported */ static int hci_le_set_write_def_data_len_sync(struct hci_dev *hdev) { struct hci_cp_le_write_def_data_len cp; if (!(hdev->le_features[0] & HCI_LE_DATA_LEN_EXT)) return 0; memset(&cp, 0, sizeof(cp)); cp.tx_len = cpu_to_le16(hdev->le_max_tx_len); cp.tx_time = cpu_to_le16(hdev->le_max_tx_time); return __hci_cmd_sync_status(hdev, HCI_OP_LE_WRITE_DEF_DATA_LEN, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* Set Default PHY parameters if command is supported, enables all supported * PHYs according to the LE Features bits. */ static int hci_le_set_default_phy_sync(struct hci_dev *hdev) { struct hci_cp_le_set_default_phy cp; if (!(hdev->commands[35] & 0x20)) { /* If the command is not supported it means only 1M PHY is * supported. */ hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; return 0; } memset(&cp, 0, sizeof(cp)); cp.all_phys = 0x00; cp.tx_phys = HCI_LE_SET_PHY_1M; cp.rx_phys = HCI_LE_SET_PHY_1M; /* Enables 2M PHY if supported */ if (le_2m_capable(hdev)) { cp.tx_phys |= HCI_LE_SET_PHY_2M; cp.rx_phys |= HCI_LE_SET_PHY_2M; } /* Enables Coded PHY if supported */ if (le_coded_capable(hdev)) { cp.tx_phys |= HCI_LE_SET_PHY_CODED; cp.rx_phys |= HCI_LE_SET_PHY_CODED; } return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static const struct hci_init_stage le_init4[] = { /* HCI_OP_LE_WRITE_DEF_DATA_LEN */ HCI_INIT(hci_le_set_write_def_data_len_sync), /* HCI_OP_LE_SET_DEFAULT_PHY */ HCI_INIT(hci_le_set_default_phy_sync), {} }; static int hci_init4_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); err = hci_init_stage_sync(hdev, hci_init4); if (err) return err; if (lmp_le_capable(hdev)) return hci_init_stage_sync(hdev, le_init4); return 0; } static int hci_init_sync(struct hci_dev *hdev) { int err; err = hci_init1_sync(hdev); if (err < 0) return err; if (hci_dev_test_flag(hdev, HCI_SETUP)) hci_debugfs_create_basic(hdev); err = hci_init2_sync(hdev); if (err < 0) return err; err = hci_init3_sync(hdev); if (err < 0) return err; err = hci_init4_sync(hdev); if (err < 0) return err; /* This function is only called when the controller is actually in * configured state. When the controller is marked as unconfigured, * this initialization procedure is not run. * * It means that it is possible that a controller runs through its * setup phase and then discovers missing settings. If that is the * case, then this function will not be called. It then will only * be called during the config phase. * * So only when in setup phase or config phase, create the debugfs * entries and register the SMP channels. */ if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) return 0; if (hci_dev_test_and_set_flag(hdev, HCI_DEBUGFS_CREATED)) return 0; hci_debugfs_create_common(hdev); if (lmp_bredr_capable(hdev)) hci_debugfs_create_bredr(hdev); if (lmp_le_capable(hdev)) hci_debugfs_create_le(hdev); return 0; } #define HCI_QUIRK_BROKEN(_quirk, _desc) { HCI_QUIRK_BROKEN_##_quirk, _desc } static const struct { unsigned long quirk; const char *desc; } hci_broken_table[] = { HCI_QUIRK_BROKEN(LOCAL_COMMANDS, "HCI Read Local Supported Commands not supported"), HCI_QUIRK_BROKEN(STORED_LINK_KEY, "HCI Delete Stored Link Key command is advertised, " "but not supported."), HCI_QUIRK_BROKEN(ERR_DATA_REPORTING, "HCI Read Default Erroneous Data Reporting command is " "advertised, but not supported."), HCI_QUIRK_BROKEN(READ_TRANSMIT_POWER, "HCI Read Transmit Power Level command is advertised, " "but not supported."), HCI_QUIRK_BROKEN(FILTER_CLEAR_ALL, "HCI Set Event Filter command not supported."), HCI_QUIRK_BROKEN(ENHANCED_SETUP_SYNC_CONN, "HCI Enhanced Setup Synchronous Connection command is " "advertised, but not supported."), HCI_QUIRK_BROKEN(SET_RPA_TIMEOUT, "HCI LE Set Random Private Address Timeout command is " "advertised, but not supported."), HCI_QUIRK_BROKEN(EXT_CREATE_CONN, "HCI LE Extended Create Connection command is " "advertised, but not supported."), HCI_QUIRK_BROKEN(WRITE_AUTH_PAYLOAD_TIMEOUT, "HCI WRITE AUTH PAYLOAD TIMEOUT command leads " "to unexpected SMP errors when pairing " "and will not be used."), HCI_QUIRK_BROKEN(LE_CODED, "HCI LE Coded PHY feature bit is set, " "but its usage is not supported.") }; /* This function handles hdev setup stage: * * Calls hdev->setup * Setup address if HCI_QUIRK_USE_BDADDR_PROPERTY is set. */ static int hci_dev_setup_sync(struct hci_dev *hdev) { int ret = 0; bool invalid_bdaddr; size_t i; if (!hci_dev_test_flag(hdev, HCI_SETUP) && !test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks)) return 0; bt_dev_dbg(hdev, ""); hci_sock_dev_event(hdev, HCI_DEV_SETUP); if (hdev->setup) ret = hdev->setup(hdev); for (i = 0; i < ARRAY_SIZE(hci_broken_table); i++) { if (test_bit(hci_broken_table[i].quirk, &hdev->quirks)) bt_dev_warn(hdev, "%s", hci_broken_table[i].desc); } /* The transport driver can set the quirk to mark the * BD_ADDR invalid before creating the HCI device or in * its setup callback. */ invalid_bdaddr = test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) || test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); if (!ret) { if (test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks) && !bacmp(&hdev->public_addr, BDADDR_ANY)) hci_dev_get_bd_addr_from_property(hdev); if (invalid_bdaddr && bacmp(&hdev->public_addr, BDADDR_ANY) && hdev->set_bdaddr) { ret = hdev->set_bdaddr(hdev, &hdev->public_addr); if (!ret) invalid_bdaddr = false; } } /* The transport driver can set these quirks before * creating the HCI device or in its setup callback. * * For the invalid BD_ADDR quirk it is possible that * it becomes a valid address if the bootloader does * provide it (see above). * * In case any of them is set, the controller has to * start up as unconfigured. */ if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) || invalid_bdaddr) hci_dev_set_flag(hdev, HCI_UNCONFIGURED); /* For an unconfigured controller it is required to * read at least the version information provided by * the Read Local Version Information command. * * If the set_bdaddr driver callback is provided, then * also the original Bluetooth public device address * will be read using the Read BD Address command. */ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return hci_unconf_init_sync(hdev); return ret; } /* This function handles hdev init stage: * * Calls hci_dev_setup_sync to perform setup stage * Calls hci_init_sync to perform HCI command init sequence */ static int hci_dev_init_sync(struct hci_dev *hdev) { int ret; bt_dev_dbg(hdev, ""); atomic_set(&hdev->cmd_cnt, 1); set_bit(HCI_INIT, &hdev->flags); ret = hci_dev_setup_sync(hdev); if (hci_dev_test_flag(hdev, HCI_CONFIG)) { /* If public address change is configured, ensure that * the address gets programmed. If the driver does not * support changing the public address, fail the power * on procedure. */ if (bacmp(&hdev->public_addr, BDADDR_ANY) && hdev->set_bdaddr) ret = hdev->set_bdaddr(hdev, &hdev->public_addr); else ret = -EADDRNOTAVAIL; } if (!ret) { if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { ret = hci_init_sync(hdev); if (!ret && hdev->post_init) ret = hdev->post_init(hdev); } } /* If the HCI Reset command is clearing all diagnostic settings, * then they need to be reprogrammed after the init procedure * completed. */ if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) && hdev->set_diag) ret = hdev->set_diag(hdev, true); if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { msft_do_open(hdev); aosp_do_open(hdev); } clear_bit(HCI_INIT, &hdev->flags); return ret; } int hci_dev_open_sync(struct hci_dev *hdev) { int ret; bt_dev_dbg(hdev, ""); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { ret = -ENODEV; goto done; } if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) { /* Check for rfkill but allow the HCI setup stage to * proceed (which in itself doesn't cause any RF activity). */ if (hci_dev_test_flag(hdev, HCI_RFKILLED)) { ret = -ERFKILL; goto done; } /* Check for valid public address or a configured static * random address, but let the HCI setup proceed to * be able to determine if there is a public address * or not. * * In case of user channel usage, it is not important * if a public address or static random address is * available. */ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && !bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY)) { ret = -EADDRNOTAVAIL; goto done; } } if (test_bit(HCI_UP, &hdev->flags)) { ret = -EALREADY; goto done; } if (hdev->open(hdev)) { ret = -EIO; goto done; } hci_devcd_reset(hdev); set_bit(HCI_RUNNING, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_OPEN); ret = hci_dev_init_sync(hdev); if (!ret) { hci_dev_hold(hdev); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); hci_adv_instances_set_rpa_expired(hdev, true); set_bit(HCI_UP, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_UP); hci_leds_update_powered(hdev, true); if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG) && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_MGMT)) { ret = hci_powered_update_sync(hdev); mgmt_power_on(hdev, ret); } } else { /* Init failed, cleanup */ flush_work(&hdev->tx_work); /* Since hci_rx_work() is possible to awake new cmd_work * it should be flushed first to avoid unexpected call of * hci_cmd_work() */ flush_work(&hdev->rx_work); flush_work(&hdev->cmd_work); skb_queue_purge(&hdev->cmd_q); skb_queue_purge(&hdev->rx_q); if (hdev->flush) hdev->flush(hdev); if (hdev->sent_cmd) { cancel_delayed_work_sync(&hdev->cmd_timer); kfree_skb(hdev->sent_cmd); hdev->sent_cmd = NULL; } if (hdev->req_skb) { kfree_skb(hdev->req_skb); hdev->req_skb = NULL; } clear_bit(HCI_RUNNING, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_CLOSE); hdev->close(hdev); hdev->flags &= BIT(HCI_RAW); } done: return ret; } /* This function requires the caller holds hdev->lock */ static void hci_pend_le_actions_clear(struct hci_dev *hdev) { struct hci_conn_params *p; list_for_each_entry(p, &hdev->le_conn_params, list) { hci_pend_le_list_del_init(p); if (p->conn) { hci_conn_drop(p->conn); hci_conn_put(p->conn); p->conn = NULL; } } BT_DBG("All LE pending actions cleared"); } static int hci_dev_shutdown(struct hci_dev *hdev) { int err = 0; /* Similar to how we first do setup and then set the exclusive access * bit for userspace, we must first unset userchannel and then clean up. * Otherwise, the kernel can't properly use the hci channel to clean up * the controller (some shutdown routines require sending additional * commands to the controller for example). */ bool was_userchannel = hci_dev_test_and_clear_flag(hdev, HCI_USER_CHANNEL); if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && test_bit(HCI_UP, &hdev->flags)) { /* Execute vendor specific shutdown routine */ if (hdev->shutdown) err = hdev->shutdown(hdev); } if (was_userchannel) hci_dev_set_flag(hdev, HCI_USER_CHANNEL); return err; } int hci_dev_close_sync(struct hci_dev *hdev) { bool auto_off; int err = 0; bt_dev_dbg(hdev, ""); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { disable_delayed_work(&hdev->power_off); disable_delayed_work(&hdev->ncmd_timer); disable_delayed_work(&hdev->le_scan_disable); } else { cancel_delayed_work(&hdev->power_off); cancel_delayed_work(&hdev->ncmd_timer); cancel_delayed_work(&hdev->le_scan_disable); } hci_cmd_sync_cancel_sync(hdev, ENODEV); cancel_interleave_scan(hdev); if (hdev->adv_instance_timeout) { cancel_delayed_work_sync(&hdev->adv_instance_expire); hdev->adv_instance_timeout = 0; } err = hci_dev_shutdown(hdev); if (!test_and_clear_bit(HCI_UP, &hdev->flags)) { cancel_delayed_work_sync(&hdev->cmd_timer); return err; } hci_leds_update_powered(hdev, false); /* Flush RX and TX works */ flush_work(&hdev->tx_work); flush_work(&hdev->rx_work); if (hdev->discov_timeout > 0) { hdev->discov_timeout = 0; hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); } if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE)) cancel_delayed_work(&hdev->service_cache); if (hci_dev_test_flag(hdev, HCI_MGMT)) { struct adv_info *adv_instance; cancel_delayed_work_sync(&hdev->rpa_expired); list_for_each_entry(adv_instance, &hdev->adv_instances, list) cancel_delayed_work_sync(&adv_instance->rpa_expired_cb); } /* Avoid potential lockdep warnings from the *_flush() calls by * ensuring the workqueue is empty up front. */ drain_workqueue(hdev->workqueue); hci_dev_lock(hdev); hci_discovery_set_state(hdev, DISCOVERY_STOPPED); auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF); if (!auto_off && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_MGMT)) __mgmt_power_off(hdev); hci_inquiry_cache_flush(hdev); hci_pend_le_actions_clear(hdev); hci_conn_hash_flush(hdev); /* Prevent data races on hdev->smp_data or hdev->smp_bredr_data */ smp_unregister(hdev); hci_dev_unlock(hdev); hci_sock_dev_event(hdev, HCI_DEV_DOWN); if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { aosp_do_close(hdev); msft_do_close(hdev); } if (hdev->flush) hdev->flush(hdev); /* Reset device */ skb_queue_purge(&hdev->cmd_q); atomic_set(&hdev->cmd_cnt, 1); if (test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks) && !auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { set_bit(HCI_INIT, &hdev->flags); hci_reset_sync(hdev); clear_bit(HCI_INIT, &hdev->flags); } /* flush cmd work */ flush_work(&hdev->cmd_work); /* Drop queues */ skb_queue_purge(&hdev->rx_q); skb_queue_purge(&hdev->cmd_q); skb_queue_purge(&hdev->raw_q); /* Drop last sent command */ if (hdev->sent_cmd) { cancel_delayed_work_sync(&hdev->cmd_timer); kfree_skb(hdev->sent_cmd); hdev->sent_cmd = NULL; } /* Drop last request */ if (hdev->req_skb) { kfree_skb(hdev->req_skb); hdev->req_skb = NULL; } clear_bit(HCI_RUNNING, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_CLOSE); /* After this point our queues are empty and no tasks are scheduled. */ hdev->close(hdev); /* Clear flags */ hdev->flags &= BIT(HCI_RAW); hci_dev_clear_volatile_flags(hdev); memset(hdev->eir, 0, sizeof(hdev->eir)); memset(hdev->dev_class, 0, sizeof(hdev->dev_class)); bacpy(&hdev->random_addr, BDADDR_ANY); hci_codec_list_clear(&hdev->local_codecs); hci_dev_put(hdev); return err; } /* This function perform power on HCI command sequence as follows: * * If controller is already up (HCI_UP) performs hci_powered_update_sync * sequence otherwise run hci_dev_open_sync which will follow with * hci_powered_update_sync after the init sequence is completed. */ static int hci_power_on_sync(struct hci_dev *hdev) { int err; if (test_bit(HCI_UP, &hdev->flags) && hci_dev_test_flag(hdev, HCI_MGMT) && hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) { cancel_delayed_work(&hdev->power_off); return hci_powered_update_sync(hdev); } err = hci_dev_open_sync(hdev); if (err < 0) return err; /* During the HCI setup phase, a few error conditions are * ignored and they need to be checked now. If they are still * valid, it is important to return the device back off. */ if (hci_dev_test_flag(hdev, HCI_RFKILLED) || hci_dev_test_flag(hdev, HCI_UNCONFIGURED) || (!bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY))) { hci_dev_clear_flag(hdev, HCI_AUTO_OFF); hci_dev_close_sync(hdev); } else if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) { queue_delayed_work(hdev->req_workqueue, &hdev->power_off, HCI_AUTO_OFF_TIMEOUT); } if (hci_dev_test_and_clear_flag(hdev, HCI_SETUP)) { /* For unconfigured devices, set the HCI_RAW flag * so that userspace can easily identify them. */ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) set_bit(HCI_RAW, &hdev->flags); /* For fully configured devices, this will send * the Index Added event. For unconfigured devices, * it will send Unconfigued Index Added event. * * Devices with HCI_QUIRK_RAW_DEVICE are ignored * and no event will be send. */ mgmt_index_added(hdev); } else if (hci_dev_test_and_clear_flag(hdev, HCI_CONFIG)) { /* When the controller is now configured, then it * is important to clear the HCI_RAW flag. */ if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) clear_bit(HCI_RAW, &hdev->flags); /* Powering on the controller with HCI_CONFIG set only * happens with the transition from unconfigured to * configured. This will send the Index Added event. */ mgmt_index_added(hdev); } return 0; } static int hci_remote_name_cancel_sync(struct hci_dev *hdev, bdaddr_t *addr) { struct hci_cp_remote_name_req_cancel cp; memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, addr); return __hci_cmd_sync_status(hdev, HCI_OP_REMOTE_NAME_REQ_CANCEL, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_stop_discovery_sync(struct hci_dev *hdev) { struct discovery_state *d = &hdev->discovery; struct inquiry_entry *e; int err; bt_dev_dbg(hdev, "state %u", hdev->discovery.state); if (d->state == DISCOVERY_FINDING || d->state == DISCOVERY_STOPPING) { if (test_bit(HCI_INQUIRY, &hdev->flags)) { err = __hci_cmd_sync_status(hdev, HCI_OP_INQUIRY_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); if (err) return err; } if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { cancel_delayed_work(&hdev->le_scan_disable); err = hci_scan_disable_sync(hdev); if (err) return err; } } else { err = hci_scan_disable_sync(hdev); if (err) return err; } /* Resume advertising if it was paused */ if (use_ll_privacy(hdev)) hci_resume_advertising_sync(hdev); /* No further actions needed for LE-only discovery */ if (d->type == DISCOV_TYPE_LE) return 0; if (d->state == DISCOVERY_RESOLVING || d->state == DISCOVERY_STOPPING) { e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, NAME_PENDING); if (!e) return 0; /* Ignore cancel errors since it should interfere with stopping * of the discovery. */ hci_remote_name_cancel_sync(hdev, &e->data.bdaddr); } return 0; } static int hci_disconnect_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { struct hci_cp_disconnect cp; if (test_bit(HCI_CONN_BIG_CREATED, &conn->flags)) { /* This is a BIS connection, hci_conn_del will * do the necessary cleanup. */ hci_dev_lock(hdev); hci_conn_failed(conn, reason); hci_dev_unlock(hdev); return 0; } memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); cp.reason = reason; /* Wait for HCI_EV_DISCONN_COMPLETE, not HCI_EV_CMD_STATUS, when the * reason is anything but HCI_ERROR_REMOTE_POWER_OFF. This reason is * used when suspending or powering off, where we don't want to wait * for the peer's response. */ if (reason != HCI_ERROR_REMOTE_POWER_OFF) return __hci_cmd_sync_status_sk(hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp, HCI_EV_DISCONN_COMPLETE, HCI_CMD_TIMEOUT, NULL); return __hci_cmd_sync_status(hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_le_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { /* Return reason if scanning since the connection shall probably be * cleanup directly. */ if (test_bit(HCI_CONN_SCANNING, &conn->flags)) return reason; if (conn->role == HCI_ROLE_SLAVE || test_and_set_bit(HCI_CONN_CANCEL, &conn->flags)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_CREATE_CONN_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); } static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { if (conn->type == LE_LINK) return hci_le_connect_cancel_sync(hdev, conn, reason); if (conn->type == ISO_LINK) { /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E * page 1857: * * If this command is issued for a CIS on the Central and the * CIS is successfully terminated before being established, * then an HCI_LE_CIS_Established event shall also be sent for * this CIS with the Status Operation Cancelled by Host (0x44). */ if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) return hci_disconnect_sync(hdev, conn, reason); /* CIS with no Create CIS sent have nothing to cancel */ if (bacmp(&conn->dst, BDADDR_ANY)) return HCI_ERROR_LOCAL_HOST_TERM; /* There is no way to cancel a BIS without terminating the BIG * which is done later on connection cleanup. */ return 0; } if (hdev->hci_ver < BLUETOOTH_VER_1_2) return 0; /* Wait for HCI_EV_CONN_COMPLETE, not HCI_EV_CMD_STATUS, when the * reason is anything but HCI_ERROR_REMOTE_POWER_OFF. This reason is * used when suspending or powering off, where we don't want to wait * for the peer's response. */ if (reason != HCI_ERROR_REMOTE_POWER_OFF) return __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN_CANCEL, 6, &conn->dst, HCI_EV_CONN_COMPLETE, HCI_CMD_TIMEOUT, NULL); return __hci_cmd_sync_status(hdev, HCI_OP_CREATE_CONN_CANCEL, 6, &conn->dst, HCI_CMD_TIMEOUT); } static int hci_reject_sco_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { struct hci_cp_reject_sync_conn_req cp; memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, &conn->dst); cp.reason = reason; /* SCO rejection has its own limited set of * allowed error values (0x0D-0x0F). */ if (reason < 0x0d || reason > 0x0f) cp.reason = HCI_ERROR_REJ_LIMITED_RESOURCES; return __hci_cmd_sync_status(hdev, HCI_OP_REJECT_SYNC_CONN_REQ, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_le_reject_cis_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { struct hci_cp_le_reject_cis cp; memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); cp.reason = reason; return __hci_cmd_sync_status(hdev, HCI_OP_LE_REJECT_CIS, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { struct hci_cp_reject_conn_req cp; if (conn->type == ISO_LINK) return hci_le_reject_cis_sync(hdev, conn, reason); if (conn->type == SCO_LINK || conn->type == ESCO_LINK) return hci_reject_sco_sync(hdev, conn, reason); memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, &conn->dst); cp.reason = reason; return __hci_cmd_sync_status(hdev, HCI_OP_REJECT_CONN_REQ, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { int err = 0; u16 handle = conn->handle; bool disconnect = false; struct hci_conn *c; switch (conn->state) { case BT_CONNECTED: case BT_CONFIG: err = hci_disconnect_sync(hdev, conn, reason); break; case BT_CONNECT: err = hci_connect_cancel_sync(hdev, conn, reason); break; case BT_CONNECT2: err = hci_reject_conn_sync(hdev, conn, reason); break; case BT_OPEN: case BT_BOUND: break; default: disconnect = true; break; } hci_dev_lock(hdev); /* Check if the connection has been cleaned up concurrently */ c = hci_conn_hash_lookup_handle(hdev, handle); if (!c || c != conn) { err = 0; goto unlock; } /* Cleanup hci_conn object if it cannot be cancelled as it * likelly means the controller and host stack are out of sync * or in case of LE it was still scanning so it can be cleanup * safely. */ if (disconnect) { conn->state = BT_CLOSED; hci_disconn_cfm(conn, reason); hci_conn_del(conn); } else { hci_conn_failed(conn, reason); } unlock: hci_dev_unlock(hdev); return err; } static int hci_disconnect_all_sync(struct hci_dev *hdev, u8 reason) { struct list_head *head = &hdev->conn_hash.list; struct hci_conn *conn; rcu_read_lock(); while ((conn = list_first_or_null_rcu(head, struct hci_conn, list))) { /* Make sure the connection is not freed while unlocking */ conn = hci_conn_get(conn); rcu_read_unlock(); /* Disregard possible errors since hci_conn_del shall have been * called even in case of errors had occurred since it would * then cause hci_conn_failed to be called which calls * hci_conn_del internally. */ hci_abort_conn_sync(hdev, conn, reason); hci_conn_put(conn); rcu_read_lock(); } rcu_read_unlock(); return 0; } /* This function perform power off HCI command sequence as follows: * * Clear Advertising * Stop Discovery * Disconnect all connections * hci_dev_close_sync */ static int hci_power_off_sync(struct hci_dev *hdev) { int err; /* If controller is already down there is nothing to do */ if (!test_bit(HCI_UP, &hdev->flags)) return 0; hci_dev_set_flag(hdev, HCI_POWERING_DOWN); if (test_bit(HCI_ISCAN, &hdev->flags) || test_bit(HCI_PSCAN, &hdev->flags)) { err = hci_write_scan_enable_sync(hdev, 0x00); if (err) goto out; } err = hci_clear_adv_sync(hdev, NULL, false); if (err) goto out; err = hci_stop_discovery_sync(hdev); if (err) goto out; /* Terminated due to Power Off */ err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF); if (err) goto out; err = hci_dev_close_sync(hdev); out: hci_dev_clear_flag(hdev, HCI_POWERING_DOWN); return err; } int hci_set_powered_sync(struct hci_dev *hdev, u8 val) { if (val) return hci_power_on_sync(hdev); return hci_power_off_sync(hdev); } static int hci_write_iac_sync(struct hci_dev *hdev) { struct hci_cp_write_current_iac_lap cp; if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) return 0; memset(&cp, 0, sizeof(cp)); if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) { /* Limited discoverable mode */ cp.num_iac = min_t(u8, hdev->num_iac, 2); cp.iac_lap[0] = 0x00; /* LIAC */ cp.iac_lap[1] = 0x8b; cp.iac_lap[2] = 0x9e; cp.iac_lap[3] = 0x33; /* GIAC */ cp.iac_lap[4] = 0x8b; cp.iac_lap[5] = 0x9e; } else { /* General discoverable mode */ cp.num_iac = 1; cp.iac_lap[0] = 0x33; /* GIAC */ cp.iac_lap[1] = 0x8b; cp.iac_lap[2] = 0x9e; } return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_CURRENT_IAC_LAP, (cp.num_iac * 3) + 1, &cp, HCI_CMD_TIMEOUT); } int hci_update_discoverable_sync(struct hci_dev *hdev) { int err = 0; if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = hci_write_iac_sync(hdev); if (err) return err; err = hci_update_scan_sync(hdev); if (err) return err; err = hci_update_class_sync(hdev); if (err) return err; } /* Advertising instances don't use the global discoverable setting, so * only update AD if advertising was enabled using Set Advertising. */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) { err = hci_update_adv_data_sync(hdev, 0x00); if (err) return err; /* Discoverable mode affects the local advertising * address in limited privacy mode. */ if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) { if (ext_adv_capable(hdev)) err = hci_start_ext_adv_sync(hdev, 0x00); else err = hci_enable_advertising_sync(hdev); } } return err; } static int update_discoverable_sync(struct hci_dev *hdev, void *data) { return hci_update_discoverable_sync(hdev); } int hci_update_discoverable(struct hci_dev *hdev) { /* Only queue if it would have any effect */ if (hdev_is_powered(hdev) && hci_dev_test_flag(hdev, HCI_ADVERTISING) && hci_dev_test_flag(hdev, HCI_DISCOVERABLE) && hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) return hci_cmd_sync_queue(hdev, update_discoverable_sync, NULL, NULL); return 0; } int hci_update_connectable_sync(struct hci_dev *hdev) { int err; err = hci_update_scan_sync(hdev); if (err) return err; /* If BR/EDR is not enabled and we disable advertising as a * by-product of disabling connectable, we need to update the * advertising flags. */ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) err = hci_update_adv_data_sync(hdev, hdev->cur_adv_instance); /* Update the advertising parameters if necessary */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || !list_empty(&hdev->adv_instances)) { if (ext_adv_capable(hdev)) err = hci_start_ext_adv_sync(hdev, hdev->cur_adv_instance); else err = hci_enable_advertising_sync(hdev); if (err) return err; } return hci_update_passive_scan_sync(hdev); } int hci_inquiry_sync(struct hci_dev *hdev, u8 length, u8 num_rsp) { const u8 giac[3] = { 0x33, 0x8b, 0x9e }; const u8 liac[3] = { 0x00, 0x8b, 0x9e }; struct hci_cp_inquiry cp; bt_dev_dbg(hdev, ""); if (test_bit(HCI_INQUIRY, &hdev->flags)) return 0; hci_dev_lock(hdev); hci_inquiry_cache_flush(hdev); hci_dev_unlock(hdev); memset(&cp, 0, sizeof(cp)); if (hdev->discovery.limited) memcpy(&cp.lap, liac, sizeof(cp.lap)); else memcpy(&cp.lap, giac, sizeof(cp.lap)); cp.length = length; cp.num_rsp = num_rsp; return __hci_cmd_sync_status(hdev, HCI_OP_INQUIRY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_active_scan_sync(struct hci_dev *hdev, uint16_t interval) { u8 own_addr_type; /* Accept list is not used for discovery */ u8 filter_policy = 0x00; /* Default is to enable duplicates filter */ u8 filter_dup = LE_SCAN_FILTER_DUP_ENABLE; int err; bt_dev_dbg(hdev, ""); /* If controller is scanning, it means the passive scanning is * running. Thus, we should temporarily stop it in order to set the * discovery scanning parameters. */ err = hci_scan_disable_sync(hdev); if (err) { bt_dev_err(hdev, "Unable to disable scanning: %d", err); return err; } cancel_interleave_scan(hdev); /* Pause address resolution for active scan and stop advertising if * privacy is enabled. */ err = hci_pause_addr_resolution(hdev); if (err) goto failed; /* All active scans will be done with either a resolvable private * address (when privacy feature has been enabled) or non-resolvable * private address. */ err = hci_update_random_address_sync(hdev, true, scan_use_rpa(hdev), &own_addr_type); if (err < 0) own_addr_type = ADDR_LE_DEV_PUBLIC; if (hci_is_adv_monitoring(hdev) || (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) && hdev->discovery.result_filtering)) { /* Duplicate filter should be disabled when some advertisement * monitor is activated, otherwise AdvMon can only receive one * advertisement for one peer(*) during active scanning, and * might report loss to these peers. * * If controller does strict duplicate filtering and the * discovery requires result filtering disables controller based * filtering since that can cause reports that would match the * host filter to not be reported. */ filter_dup = LE_SCAN_FILTER_DUP_DISABLE; } err = hci_start_scan_sync(hdev, LE_SCAN_ACTIVE, interval, hdev->le_scan_window_discovery, own_addr_type, filter_policy, filter_dup); if (!err) return err; failed: /* Resume advertising if it was paused */ if (use_ll_privacy(hdev)) hci_resume_advertising_sync(hdev); /* Resume passive scanning */ hci_update_passive_scan_sync(hdev); return err; } static int hci_start_interleaved_discovery_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); err = hci_active_scan_sync(hdev, hdev->le_scan_int_discovery * 2); if (err) return err; return hci_inquiry_sync(hdev, DISCOV_BREDR_INQUIRY_LEN, 0); } int hci_start_discovery_sync(struct hci_dev *hdev) { unsigned long timeout; int err; bt_dev_dbg(hdev, "type %u", hdev->discovery.type); switch (hdev->discovery.type) { case DISCOV_TYPE_BREDR: return hci_inquiry_sync(hdev, DISCOV_BREDR_INQUIRY_LEN, 0); case DISCOV_TYPE_INTERLEAVED: /* When running simultaneous discovery, the LE scanning time * should occupy the whole discovery time sine BR/EDR inquiry * and LE scanning are scheduled by the controller. * * For interleaving discovery in comparison, BR/EDR inquiry * and LE scanning are done sequentially with separate * timeouts. */ if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) { timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT); /* During simultaneous discovery, we double LE scan * interval. We must leave some time for the controller * to do BR/EDR inquiry. */ err = hci_start_interleaved_discovery_sync(hdev); break; } timeout = msecs_to_jiffies(hdev->discov_interleaved_timeout); err = hci_active_scan_sync(hdev, hdev->le_scan_int_discovery); break; case DISCOV_TYPE_LE: timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT); err = hci_active_scan_sync(hdev, hdev->le_scan_int_discovery); break; default: return -EINVAL; } if (err) return err; bt_dev_dbg(hdev, "timeout %u ms", jiffies_to_msecs(timeout)); queue_delayed_work(hdev->req_workqueue, &hdev->le_scan_disable, timeout); return 0; } static void hci_suspend_monitor_sync(struct hci_dev *hdev) { switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_MSFT: msft_suspend_sync(hdev); break; default: return; } } /* This function disables discovery and mark it as paused */ static int hci_pause_discovery_sync(struct hci_dev *hdev) { int old_state = hdev->discovery.state; int err; /* If discovery already stopped/stopping/paused there nothing to do */ if (old_state == DISCOVERY_STOPPED || old_state == DISCOVERY_STOPPING || hdev->discovery_paused) return 0; hci_discovery_set_state(hdev, DISCOVERY_STOPPING); err = hci_stop_discovery_sync(hdev); if (err) return err; hdev->discovery_paused = true; hci_discovery_set_state(hdev, DISCOVERY_STOPPED); return 0; } static int hci_update_event_filter_sync(struct hci_dev *hdev) { struct bdaddr_list_with_flags *b; u8 scan = SCAN_DISABLED; bool scanning = test_bit(HCI_PSCAN, &hdev->flags); int err; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; /* Some fake CSR controllers lock up after setting this type of * filter, so avoid sending the request altogether. */ if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks)) return 0; /* Always clear event filter when starting */ hci_clear_event_filter_sync(hdev); list_for_each_entry(b, &hdev->accept_list, list) { if (!(b->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) continue; bt_dev_dbg(hdev, "Adding event filters for %pMR", &b->bdaddr); err = hci_set_event_filter_sync(hdev, HCI_FLT_CONN_SETUP, HCI_CONN_SETUP_ALLOW_BDADDR, &b->bdaddr, HCI_CONN_SETUP_AUTO_ON); if (err) bt_dev_dbg(hdev, "Failed to set event filter for %pMR", &b->bdaddr); else scan = SCAN_PAGE; } if (scan && !scanning) hci_write_scan_enable_sync(hdev, scan); else if (!scan && scanning) hci_write_scan_enable_sync(hdev, scan); return 0; } /* This function disables scan (BR and LE) and mark it as paused */ static int hci_pause_scan_sync(struct hci_dev *hdev) { if (hdev->scanning_paused) return 0; /* Disable page scan if enabled */ if (test_bit(HCI_PSCAN, &hdev->flags)) hci_write_scan_enable_sync(hdev, SCAN_DISABLED); hci_scan_disable_sync(hdev); hdev->scanning_paused = true; return 0; } /* This function performs the HCI suspend procedures in the follow order: * * Pause discovery (active scanning/inquiry) * Pause Directed Advertising/Advertising * Pause Scanning (passive scanning in case discovery was not active) * Disconnect all connections * Set suspend_status to BT_SUSPEND_DISCONNECT if hdev cannot wakeup * otherwise: * Update event mask (only set events that are allowed to wake up the host) * Update event filter (with devices marked with HCI_CONN_FLAG_REMOTE_WAKEUP) * Update passive scanning (lower duty cycle) * Set suspend_status to BT_SUSPEND_CONFIGURE_WAKE */ int hci_suspend_sync(struct hci_dev *hdev) { int err; /* If marked as suspended there nothing to do */ if (hdev->suspended) return 0; /* Mark device as suspended */ hdev->suspended = true; /* Pause discovery if not already stopped */ hci_pause_discovery_sync(hdev); /* Pause other advertisements */ hci_pause_advertising_sync(hdev); /* Suspend monitor filters */ hci_suspend_monitor_sync(hdev); /* Prevent disconnects from causing scanning to be re-enabled */ hci_pause_scan_sync(hdev); if (hci_conn_count(hdev)) { /* Soft disconnect everything (power off) */ err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF); if (err) { /* Set state to BT_RUNNING so resume doesn't notify */ hdev->suspend_state = BT_RUNNING; hci_resume_sync(hdev); return err; } /* Update event mask so only the allowed event can wakeup the * host. */ hci_set_event_mask_sync(hdev); } /* Only configure accept list if disconnect succeeded and wake * isn't being prevented. */ if (!hdev->wakeup || !hdev->wakeup(hdev)) { hdev->suspend_state = BT_SUSPEND_DISCONNECT; return 0; } /* Unpause to take care of updating scanning params */ hdev->scanning_paused = false; /* Enable event filter for paired devices */ hci_update_event_filter_sync(hdev); /* Update LE passive scan if enabled */ hci_update_passive_scan_sync(hdev); /* Pause scan changes again. */ hdev->scanning_paused = true; hdev->suspend_state = BT_SUSPEND_CONFIGURE_WAKE; return 0; } /* This function resumes discovery */ static int hci_resume_discovery_sync(struct hci_dev *hdev) { int err; /* If discovery not paused there nothing to do */ if (!hdev->discovery_paused) return 0; hdev->discovery_paused = false; hci_discovery_set_state(hdev, DISCOVERY_STARTING); err = hci_start_discovery_sync(hdev); hci_discovery_set_state(hdev, err ? DISCOVERY_STOPPED : DISCOVERY_FINDING); return err; } static void hci_resume_monitor_sync(struct hci_dev *hdev) { switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_MSFT: msft_resume_sync(hdev); break; default: return; } } /* This function resume scan and reset paused flag */ static int hci_resume_scan_sync(struct hci_dev *hdev) { if (!hdev->scanning_paused) return 0; hdev->scanning_paused = false; hci_update_scan_sync(hdev); /* Reset passive scanning to normal */ hci_update_passive_scan_sync(hdev); return 0; } /* This function performs the HCI suspend procedures in the follow order: * * Restore event mask * Clear event filter * Update passive scanning (normal duty cycle) * Resume Directed Advertising/Advertising * Resume discovery (active scanning/inquiry) */ int hci_resume_sync(struct hci_dev *hdev) { /* If not marked as suspended there nothing to do */ if (!hdev->suspended) return 0; hdev->suspended = false; /* Restore event mask */ hci_set_event_mask_sync(hdev); /* Clear any event filters and restore scan state */ hci_clear_event_filter_sync(hdev); /* Resume scanning */ hci_resume_scan_sync(hdev); /* Resume monitor filters */ hci_resume_monitor_sync(hdev); /* Resume other advertisements */ hci_resume_advertising_sync(hdev); /* Resume discovery */ hci_resume_discovery_sync(hdev); return 0; } static bool conn_use_rpa(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; return hci_dev_test_flag(hdev, HCI_PRIVACY); } static int hci_le_ext_directed_advertising_sync(struct hci_dev *hdev, struct hci_conn *conn) { struct hci_cp_le_set_ext_adv_params cp; int err; bdaddr_t random_addr; u8 own_addr_type; err = hci_update_random_address_sync(hdev, false, conn_use_rpa(conn), &own_addr_type); if (err) return err; /* Set require_privacy to false so that the remote device has a * chance of identifying us. */ err = hci_get_random_address(hdev, false, conn_use_rpa(conn), NULL, &own_addr_type, &random_addr); if (err) return err; memset(&cp, 0, sizeof(cp)); cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_DIRECT_IND); cp.channel_map = hdev->le_adv_channel_map; cp.tx_power = HCI_TX_POWER_INVALID; cp.primary_phy = HCI_ADV_PHY_1M; cp.secondary_phy = HCI_ADV_PHY_1M; cp.handle = 0x00; /* Use instance 0 for directed adv */ cp.own_addr_type = own_addr_type; cp.peer_addr_type = conn->dst_type; bacpy(&cp.peer_addr, &conn->dst); /* As per Core Spec 5.2 Vol 2, PART E, Sec 7.8.53, for * advertising_event_property LE_LEGACY_ADV_DIRECT_IND * does not supports advertising data when the advertising set already * contains some, the controller shall return erroc code 'Invalid * HCI Command Parameters(0x12). * So it is required to remove adv set for handle 0x00. since we use * instance 0 for directed adv. */ err = hci_remove_ext_adv_instance_sync(hdev, cp.handle, NULL); if (err) return err; err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (err) return err; /* Check if random address need to be updated */ if (own_addr_type == ADDR_LE_DEV_RANDOM && bacmp(&random_addr, BDADDR_ANY) && bacmp(&random_addr, &hdev->random_addr)) { err = hci_set_adv_set_random_addr_sync(hdev, 0x00, &random_addr); if (err) return err; } return hci_enable_ext_advertising_sync(hdev, 0x00); } static int hci_le_directed_advertising_sync(struct hci_dev *hdev, struct hci_conn *conn) { struct hci_cp_le_set_adv_param cp; u8 status; u8 own_addr_type; u8 enable; if (ext_adv_capable(hdev)) return hci_le_ext_directed_advertising_sync(hdev, conn); /* Clear the HCI_LE_ADV bit temporarily so that the * hci_update_random_address knows that it's safe to go ahead * and write a new random address. The flag will be set back on * as soon as the SET_ADV_ENABLE HCI command completes. */ hci_dev_clear_flag(hdev, HCI_LE_ADV); /* Set require_privacy to false so that the remote device has a * chance of identifying us. */ status = hci_update_random_address_sync(hdev, false, conn_use_rpa(conn), &own_addr_type); if (status) return status; memset(&cp, 0, sizeof(cp)); /* Some controllers might reject command if intervals are not * within range for undirected advertising. * BCM20702A0 is known to be affected by this. */ cp.min_interval = cpu_to_le16(0x0020); cp.max_interval = cpu_to_le16(0x0020); cp.type = LE_ADV_DIRECT_IND; cp.own_address_type = own_addr_type; cp.direct_addr_type = conn->dst_type; bacpy(&cp.direct_addr, &conn->dst); cp.channel_map = hdev->le_adv_channel_map; status = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (status) return status; enable = 0x01; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable, HCI_CMD_TIMEOUT); } static void set_ext_conn_params(struct hci_conn *conn, struct hci_cp_le_ext_conn_param *p) { struct hci_dev *hdev = conn->hdev; memset(p, 0, sizeof(*p)); p->scan_interval = cpu_to_le16(hdev->le_scan_int_connect); p->scan_window = cpu_to_le16(hdev->le_scan_window_connect); p->conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); p->conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); p->conn_latency = cpu_to_le16(conn->le_conn_latency); p->supervision_timeout = cpu_to_le16(conn->le_supv_timeout); p->min_ce_len = cpu_to_le16(0x0000); p->max_ce_len = cpu_to_le16(0x0000); } static int hci_le_ext_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 own_addr_type) { struct hci_cp_le_ext_create_conn *cp; struct hci_cp_le_ext_conn_param *p; u8 data[sizeof(*cp) + sizeof(*p) * 3]; u32 plen; cp = (void *)data; p = (void *)cp->data; memset(cp, 0, sizeof(*cp)); bacpy(&cp->peer_addr, &conn->dst); cp->peer_addr_type = conn->dst_type; cp->own_addr_type = own_addr_type; plen = sizeof(*cp); if (scan_1m(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_1M || conn->le_adv_sec_phy == HCI_ADV_PHY_1M)) { cp->phys |= LE_SCAN_PHY_1M; set_ext_conn_params(conn, p); p++; plen += sizeof(*p); } if (scan_2m(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_2M || conn->le_adv_sec_phy == HCI_ADV_PHY_2M)) { cp->phys |= LE_SCAN_PHY_2M; set_ext_conn_params(conn, p); p++; plen += sizeof(*p); } if (scan_coded(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_CODED || conn->le_adv_sec_phy == HCI_ADV_PHY_CODED)) { cp->phys |= LE_SCAN_PHY_CODED; set_ext_conn_params(conn, p); plen += sizeof(*p); } return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_EXT_CREATE_CONN, plen, data, HCI_EV_LE_ENHANCED_CONN_COMPLETE, conn->conn_timeout, NULL); } static int hci_le_create_conn_sync(struct hci_dev *hdev, void *data) { struct hci_cp_le_create_conn cp; struct hci_conn_params *params; u8 own_addr_type; int err; struct hci_conn *conn = data; if (!hci_conn_valid(hdev, conn)) return -ECANCELED; bt_dev_dbg(hdev, "conn %p", conn); clear_bit(HCI_CONN_SCANNING, &conn->flags); conn->state = BT_CONNECT; /* If requested to connect as peripheral use directed advertising */ if (conn->role == HCI_ROLE_SLAVE) { /* If we're active scanning and simultaneous roles is not * enabled simply reject the attempt. */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN) && hdev->le_scan_type == LE_SCAN_ACTIVE && !hci_dev_test_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES)) { hci_conn_del(conn); return -EBUSY; } /* Pause advertising while doing directed advertising. */ hci_pause_advertising_sync(hdev); err = hci_le_directed_advertising_sync(hdev, conn); goto done; } /* Disable advertising if simultaneous roles is not in use. */ if (!hci_dev_test_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES)) hci_pause_advertising_sync(hdev); params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); if (params) { conn->le_conn_min_interval = params->conn_min_interval; conn->le_conn_max_interval = params->conn_max_interval; conn->le_conn_latency = params->conn_latency; conn->le_supv_timeout = params->supervision_timeout; } else { conn->le_conn_min_interval = hdev->le_conn_min_interval; conn->le_conn_max_interval = hdev->le_conn_max_interval; conn->le_conn_latency = hdev->le_conn_latency; conn->le_supv_timeout = hdev->le_supv_timeout; } /* If controller is scanning, we stop it since some controllers are * not able to scan and connect at the same time. Also set the * HCI_LE_SCAN_INTERRUPTED flag so that the command complete * handler for scan disabling knows to set the correct discovery * state. */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { hci_scan_disable_sync(hdev); hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED); } /* Update random address, but set require_privacy to false so * that we never connect with an non-resolvable address. */ err = hci_update_random_address_sync(hdev, false, conn_use_rpa(conn), &own_addr_type); if (err) goto done; /* Send command LE Extended Create Connection if supported */ if (use_ext_conn(hdev)) { err = hci_le_ext_create_conn_sync(hdev, conn, own_addr_type); goto done; } memset(&cp, 0, sizeof(cp)); cp.scan_interval = cpu_to_le16(hdev->le_scan_int_connect); cp.scan_window = cpu_to_le16(hdev->le_scan_window_connect); bacpy(&cp.peer_addr, &conn->dst); cp.peer_addr_type = conn->dst_type; cp.own_address_type = own_addr_type; cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); cp.conn_latency = cpu_to_le16(conn->le_conn_latency); cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout); cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 2261: * * If this event is unmasked and the HCI_LE_Connection_Complete event * is unmasked, only the HCI_LE_Enhanced_Connection_Complete event is * sent when a new connection has been created. */ err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp, use_enhanced_conn_complete(hdev) ? HCI_EV_LE_ENHANCED_CONN_COMPLETE : HCI_EV_LE_CONN_COMPLETE, conn->conn_timeout, NULL); done: if (err == -ETIMEDOUT) hci_le_connect_cancel_sync(hdev, conn, 0x00); /* Re-enable advertising after the connection attempt is finished. */ hci_resume_advertising_sync(hdev); return err; } int hci_le_create_cis_sync(struct hci_dev *hdev) { DEFINE_FLEX(struct hci_cp_le_create_cis, cmd, cis, num_cis, 0x1f); size_t aux_num_cis = 0; struct hci_conn *conn; u8 cig = BT_ISO_QOS_CIG_UNSET; /* The spec allows only one pending LE Create CIS command at a time. If * the command is pending now, don't do anything. We check for pending * connections after each CIS Established event. * * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E * page 2566: * * If the Host issues this command before all the * HCI_LE_CIS_Established events from the previous use of the * command have been generated, the Controller shall return the * error code Command Disallowed (0x0C). * * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E * page 2567: * * When the Controller receives the HCI_LE_Create_CIS command, the * Controller sends the HCI_Command_Status event to the Host. An * HCI_LE_CIS_Established event will be generated for each CIS when it * is established or if it is disconnected or considered lost before * being established; until all the events are generated, the command * remains pending. */ hci_dev_lock(hdev); rcu_read_lock(); /* Wait until previous Create CIS has completed */ list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) goto done; } /* Find CIG with all CIS ready */ list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { struct hci_conn *link; if (hci_conn_check_create_cis(conn)) continue; cig = conn->iso_qos.ucast.cig; list_for_each_entry_rcu(link, &hdev->conn_hash.list, list) { if (hci_conn_check_create_cis(link) > 0 && link->iso_qos.ucast.cig == cig && link->state != BT_CONNECTED) { cig = BT_ISO_QOS_CIG_UNSET; break; } } if (cig != BT_ISO_QOS_CIG_UNSET) break; } if (cig == BT_ISO_QOS_CIG_UNSET) goto done; list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { struct hci_cis *cis = &cmd->cis[aux_num_cis]; if (hci_conn_check_create_cis(conn) || conn->iso_qos.ucast.cig != cig) continue; set_bit(HCI_CONN_CREATE_CIS, &conn->flags); cis->acl_handle = cpu_to_le16(conn->parent->handle); cis->cis_handle = cpu_to_le16(conn->handle); aux_num_cis++; if (aux_num_cis >= cmd->num_cis) break; } cmd->num_cis = aux_num_cis; done: rcu_read_unlock(); hci_dev_unlock(hdev); if (!aux_num_cis) return 0; /* Wait for HCI_LE_CIS_Established */ return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CIS, struct_size(cmd, cis, cmd->num_cis), cmd, HCI_EVT_LE_CIS_ESTABLISHED, conn->conn_timeout, NULL); } int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle) { struct hci_cp_le_remove_cig cp; memset(&cp, 0, sizeof(cp)); cp.cig_id = handle; return __hci_cmd_sync_status(hdev, HCI_OP_LE_REMOVE_CIG, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_le_big_terminate_sync(struct hci_dev *hdev, u8 handle) { struct hci_cp_le_big_term_sync cp; memset(&cp, 0, sizeof(cp)); cp.handle = handle; return __hci_cmd_sync_status(hdev, HCI_OP_LE_BIG_TERM_SYNC, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_le_pa_terminate_sync(struct hci_dev *hdev, u16 handle) { struct hci_cp_le_pa_term_sync cp; memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(handle); return __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_TERM_SYNC, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, bool use_rpa, struct adv_info *adv_instance, u8 *own_addr_type, bdaddr_t *rand_addr) { int err; bacpy(rand_addr, BDADDR_ANY); /* If privacy is enabled use a resolvable private address. If * current RPA has expired then generate a new one. */ if (use_rpa) { /* If Controller supports LL Privacy use own address type is * 0x03 */ if (use_ll_privacy(hdev)) *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED; else *own_addr_type = ADDR_LE_DEV_RANDOM; if (adv_instance) { if (adv_rpa_valid(adv_instance)) return 0; } else { if (rpa_valid(hdev)) return 0; } err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); if (err < 0) { bt_dev_err(hdev, "failed to generate new RPA"); return err; } bacpy(rand_addr, &hdev->rpa); return 0; } /* In case of required privacy without resolvable private address, * use an non-resolvable private address. This is useful for * non-connectable advertising. */ if (require_privacy) { bdaddr_t nrpa; while (true) { /* The non-resolvable private address is generated * from random six bytes with the two most significant * bits cleared. */ get_random_bytes(&nrpa, 6); nrpa.b[5] &= 0x3f; /* The non-resolvable private address shall not be * equal to the public address. */ if (bacmp(&hdev->bdaddr, &nrpa)) break; } *own_addr_type = ADDR_LE_DEV_RANDOM; bacpy(rand_addr, &nrpa); return 0; } /* No privacy so use a public address. */ *own_addr_type = ADDR_LE_DEV_PUBLIC; return 0; } static int _update_adv_data_sync(struct hci_dev *hdev, void *data) { u8 instance = PTR_UINT(data); return hci_update_adv_data_sync(hdev, instance); } int hci_update_adv_data(struct hci_dev *hdev, u8 instance) { return hci_cmd_sync_queue(hdev, _update_adv_data_sync, UINT_PTR(instance), NULL); } static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data) { struct hci_conn *conn = data; struct inquiry_entry *ie; struct hci_cp_create_conn cp; int err; if (!hci_conn_valid(hdev, conn)) return -ECANCELED; /* Many controllers disallow HCI Create Connection while it is doing * HCI Inquiry. So we cancel the Inquiry first before issuing HCI Create * Connection. This may cause the MGMT discovering state to become false * without user space's request but it is okay since the MGMT Discovery * APIs do not promise that discovery should be done forever. Instead, * the user space monitors the status of MGMT discovering and it may * request for discovery again when this flag becomes false. */ if (test_bit(HCI_INQUIRY, &hdev->flags)) { err = __hci_cmd_sync_status(hdev, HCI_OP_INQUIRY_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); if (err) bt_dev_warn(hdev, "Failed to cancel inquiry %d", err); } conn->state = BT_CONNECT; conn->out = true; conn->role = HCI_ROLE_MASTER; conn->attempt++; conn->link_policy = hdev->link_policy; memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, &conn->dst); cp.pscan_rep_mode = 0x02; ie = hci_inquiry_cache_lookup(hdev, &conn->dst); if (ie) { if (inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) { cp.pscan_rep_mode = ie->data.pscan_rep_mode; cp.pscan_mode = ie->data.pscan_mode; cp.clock_offset = ie->data.clock_offset | cpu_to_le16(0x8000); } memcpy(conn->dev_class, ie->data.dev_class, 3); } cp.pkt_type = cpu_to_le16(conn->pkt_type); if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER)) cp.role_switch = 0x01; else cp.role_switch = 0x00; return __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN, sizeof(cp), &cp, HCI_EV_CONN_COMPLETE, conn->conn_timeout, NULL); } int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn) { return hci_cmd_sync_queue_once(hdev, hci_acl_create_conn_sync, conn, NULL); } static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) { struct hci_conn *conn = data; bt_dev_dbg(hdev, "err %d", err); if (err == -ECANCELED) return; hci_dev_lock(hdev); if (!hci_conn_valid(hdev, conn)) goto done; if (!err) { hci_connect_le_scan_cleanup(conn, 0x00); goto done; } /* Check if connection is still pending */ if (conn != hci_lookup_le_connect(hdev)) goto done; /* Flush to make sure we send create conn cancel command if needed */ flush_delayed_work(&conn->le_conn_timeout); hci_conn_failed(conn, bt_status(err)); done: hci_dev_unlock(hdev); } int hci_connect_le_sync(struct hci_dev *hdev, struct hci_conn *conn) { return hci_cmd_sync_queue_once(hdev, hci_le_create_conn_sync, conn, create_le_conn_complete); } int hci_cancel_connect_sync(struct hci_dev *hdev, struct hci_conn *conn) { if (conn->state != BT_OPEN) return -EINVAL; switch (conn->type) { case ACL_LINK: return !hci_cmd_sync_dequeue_once(hdev, hci_acl_create_conn_sync, conn, NULL); case LE_LINK: return !hci_cmd_sync_dequeue_once(hdev, hci_le_create_conn_sync, conn, create_le_conn_complete); } return -ENOENT; } int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn, struct hci_conn_params *params) { struct hci_cp_le_conn_update cp; memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); cp.conn_interval_min = cpu_to_le16(params->conn_min_interval); cp.conn_interval_max = cpu_to_le16(params->conn_max_interval); cp.conn_latency = cpu_to_le16(params->conn_latency); cp.supervision_timeout = cpu_to_le16(params->supervision_timeout); cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); return __hci_cmd_sync_status(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); }
15 11102 11044 256 747 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 /* SPDX-License-Identifier: GPL-2.0-only */ /* * kref.h - library routines for handling generic reference counted objects * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Corp. * * based on kobject.h which was: * Copyright (C) 2002-2003 Patrick Mochel <mochel@osdl.org> * Copyright (C) 2002-2003 Open Source Development Labs */ #ifndef _KREF_H_ #define _KREF_H_ #include <linux/spinlock.h> #include <linux/refcount.h> struct kref { refcount_t refcount; }; #define KREF_INIT(n) { .refcount = REFCOUNT_INIT(n), } /** * kref_init - initialize object. * @kref: object in question. */ static inline void kref_init(struct kref *kref) { refcount_set(&kref->refcount, 1); } static inline unsigned int kref_read(const struct kref *kref) { return refcount_read(&kref->refcount); } /** * kref_get - increment refcount for object. * @kref: object. */ static inline void kref_get(struct kref *kref) { refcount_inc(&kref->refcount); } /** * kref_put - decrement refcount for object. * @kref: object. * @release: pointer to the function that will clean up the object when the * last reference to the object is released. * This pointer is required, and it is not acceptable to pass kfree * in as this function. * * Decrement the refcount, and if 0, call release(). * Return 1 if the object was removed, otherwise return 0. Beware, if this * function returns 0, you still can not count on the kref from remaining in * memory. Only use the return value if you want to see if the kref is now * gone, not present. */ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)) { if (refcount_dec_and_test(&kref->refcount)) { release(kref); return 1; } return 0; } static inline int kref_put_mutex(struct kref *kref, void (*release)(struct kref *kref), struct mutex *lock) { if (refcount_dec_and_mutex_lock(&kref->refcount, lock)) { release(kref); return 1; } return 0; } static inline int kref_put_lock(struct kref *kref, void (*release)(struct kref *kref), spinlock_t *lock) { if (refcount_dec_and_lock(&kref->refcount, lock)) { release(kref); return 1; } return 0; } /** * kref_get_unless_zero - Increment refcount for object unless it is zero. * @kref: object. * * Return non-zero if the increment succeeded. Otherwise return 0. * * This function is intended to simplify locking around refcounting for * objects that can be looked up from a lookup structure, and which are * removed from that lookup structure in the object destructor. * Operations on such objects require at least a read lock around * lookup + kref_get, and a write lock around kref_put + remove from lookup * structure. Furthermore, RCU implementations become extremely tricky. * With a lookup followed by a kref_get_unless_zero *with return value check* * locking in the kref_put path can be deferred to the actual removal from * the lookup structure and RCU lookups become trivial. */ static inline int __must_check kref_get_unless_zero(struct kref *kref) { return refcount_inc_not_zero(&kref->refcount); } #endif /* _KREF_H_ */
14102 12 12 12 12 12 12 12 12 12 12 12 12 4192 4190 4188 12 11 4196 4187 4189 4196 4186 4196 4186 4196 4802 4811 516 4804 4811 130 4800 4811 4811 4811 4800 4801 12 4811 12 12 12 12 107 4761 3071 3063 4725 4191 3194 3194 4619 4620 4620 4757 4729 4760 4763 78 4721 4612 4762 4767 162 528 4235 4186 528 3185 4761 4768 4767 160 162 162 162 3194 3194 3193 82 3155 3160 3154 528 3058 34 161 528 3185 3060 161 3194 3194 3194 3140 958 1 3138 4058 46 50 4727 4725 3152 4058 4760 4767 3186 4056 3187 3194 4766 4762 4746 4192 4191 4190 4196 4184 185 4338 4572 4345 4187 4196 4186 4759 4758 4756 4724 65 1 259 260 4187 4188 2 4186 62 4179 6 6 65 65 64 12 4378 4715 19 4729 408 388 19 408 12 411 29 4724 4736 4444 4433 4727 4731 4729 4750 17 4730 12 4736 4730 4360 3995 4129 9 32 32 32 29 31 32 32 32 32 32 32 32 32 32 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1993 Linus Torvalds * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 * Numa awareness, Christoph Lameter, SGI, June 2005 * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019 */ #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/highmem.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/set_memory.h> #include <linux/debugobjects.h> #include <linux/kallsyms.h> #include <linux/list.h> #include <linux/notifier.h> #include <linux/rbtree.h> #include <linux/xarray.h> #include <linux/io.h> #include <linux/rcupdate.h> #include <linux/pfn.h> #include <linux/kmemleak.h> #include <linux/atomic.h> #include <linux/compiler.h> #include <linux/memcontrol.h> #include <linux/llist.h> #include <linux/uio.h> #include <linux/bitops.h> #include <linux/rbtree_augmented.h> #include <linux/overflow.h> #include <linux/pgtable.h> #include <linux/hugetlb.h> #include <linux/sched/mm.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> #include <linux/page_owner.h> #define CREATE_TRACE_POINTS #include <trace/events/vmalloc.h> #include "internal.h" #include "pgalloc-track.h" #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1; static int __init set_nohugeiomap(char *str) { ioremap_max_page_shift = PAGE_SHIFT; return 0; } early_param("nohugeiomap", set_nohugeiomap); #else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ static const unsigned int ioremap_max_page_shift = PAGE_SHIFT; #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC static bool __ro_after_init vmap_allow_huge = true; static int __init set_nohugevmalloc(char *str) { vmap_allow_huge = false; return 0; } early_param("nohugevmalloc", set_nohugevmalloc); #else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ static const bool vmap_allow_huge = false; #endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ bool is_vmalloc_addr(const void *x) { unsigned long addr = (unsigned long)kasan_reset_tag(x); return addr >= VMALLOC_START && addr < VMALLOC_END; } EXPORT_SYMBOL(is_vmalloc_addr); struct vfree_deferred { struct llist_head list; struct work_struct wq; }; static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); /*** Page table manipulation functions ***/ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { pte_t *pte; u64 pfn; struct page *page; unsigned long size = PAGE_SIZE; pfn = phys_addr >> PAGE_SHIFT; pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { if (unlikely(!pte_none(ptep_get(pte)))) { if (pfn_valid(pfn)) { page = pfn_to_page(pfn); dump_page(page, "remapping already mapped page"); } BUG(); } #ifdef CONFIG_HUGETLB_PAGE size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift); if (size != PAGE_SIZE) { pte_t entry = pfn_pte(pfn, prot); entry = arch_make_huge_pte(entry, ilog2(size), 0); set_huge_pte_at(&init_mm, addr, pte, entry, size); pfn += PFN_DOWN(size); continue; } #endif set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte += PFN_DOWN(size), addr += size, addr != end); *mask |= PGTBL_PTE_MODIFIED; return 0; } static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { if (max_page_shift < PMD_SHIFT) return 0; if (!arch_vmap_pmd_supported(prot)) return 0; if ((end - addr) != PMD_SIZE) return 0; if (!IS_ALIGNED(addr, PMD_SIZE)) return 0; if (!IS_ALIGNED(phys_addr, PMD_SIZE)) return 0; if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) return 0; return pmd_set_huge(pmd, phys_addr, prot); } static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, max_page_shift)) { *mask |= PGTBL_PMD_MODIFIED; continue; } if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask)) return -ENOMEM; } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); return 0; } static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { if (max_page_shift < PUD_SHIFT) return 0; if (!arch_vmap_pud_supported(prot)) return 0; if ((end - addr) != PUD_SIZE) return 0; if (!IS_ALIGNED(addr, PUD_SIZE)) return 0; if (!IS_ALIGNED(phys_addr, PUD_SIZE)) return 0; if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) return 0; return pud_set_huge(pud, phys_addr, prot); } static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, max_page_shift)) { *mask |= PGTBL_PUD_MODIFIED; continue; } if (vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift, mask)) return -ENOMEM; } while (pud++, phys_addr += (next - addr), addr = next, addr != end); return 0; } static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { if (max_page_shift < P4D_SHIFT) return 0; if (!arch_vmap_p4d_supported(prot)) return 0; if ((end - addr) != P4D_SIZE) return 0; if (!IS_ALIGNED(addr, P4D_SIZE)) return 0; if (!IS_ALIGNED(phys_addr, P4D_SIZE)) return 0; if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) return 0; return p4d_set_huge(p4d, phys_addr, prot); } static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, max_page_shift)) { *mask |= PGTBL_P4D_MODIFIED; continue; } if (vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift, mask)) return -ENOMEM; } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); return 0; } static int vmap_range_noflush(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { pgd_t *pgd; unsigned long start; unsigned long next; int err; pgtbl_mod_mask mask = 0; might_sleep(); BUG_ON(addr >= end); start = addr; pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, max_page_shift, &mask); if (err) break; } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); return err; } int vmap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { int err; err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot), ioremap_max_page_shift); flush_cache_vmap(addr, end); if (!err) err = kmsan_ioremap_page_range(addr, end, phys_addr, prot, ioremap_max_page_shift); return err; } int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { struct vm_struct *area; area = find_vm_area((void *)addr); if (!area || !(area->flags & VM_IOREMAP)) { WARN_ONCE(1, "vm_area at addr %lx is not marked as VM_IOREMAP\n", addr); return -EINVAL; } if (addr != (unsigned long)area->addr || (void *)end != area->addr + get_vm_area_size(area)) { WARN_ONCE(1, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n", addr, end, (long)area->addr, (long)area->addr + get_vm_area_size(area)); return -ERANGE; } return vmap_page_range(addr, end, phys_addr, prot); } static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pte_t *pte; pte = pte_offset_kernel(pmd, addr); do { pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte++, addr += PAGE_SIZE, addr != end); *mask |= PGTBL_PTE_MODIFIED; } static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; int cleared; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); cleared = pmd_clear_huge(pmd); if (cleared || pmd_bad(*pmd)) *mask |= PGTBL_PMD_MODIFIED; if (cleared) continue; if (pmd_none_or_clear_bad(pmd)) continue; vunmap_pte_range(pmd, addr, next, mask); cond_resched(); } while (pmd++, addr = next, addr != end); } static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; int cleared; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); cleared = pud_clear_huge(pud); if (cleared || pud_bad(*pud)) *mask |= PGTBL_PUD_MODIFIED; if (cleared) continue; if (pud_none_or_clear_bad(pud)) continue; vunmap_pmd_range(pud, addr, next, mask); } while (pud++, addr = next, addr != end); } static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); p4d_clear_huge(p4d); if (p4d_bad(*p4d)) *mask |= PGTBL_P4D_MODIFIED; if (p4d_none_or_clear_bad(p4d)) continue; vunmap_pud_range(p4d, addr, next, mask); } while (p4d++, addr = next, addr != end); } /* * vunmap_range_noflush is similar to vunmap_range, but does not * flush caches or TLBs. * * The caller is responsible for calling flush_cache_vmap() before calling * this function, and flush_tlb_kernel_range after it has returned * successfully (and before the addresses are expected to cause a page fault * or be re-mapped for something else, if TLB flushes are being delayed or * coalesced). * * This is an internal function only. Do not use outside mm/. */ void __vunmap_range_noflush(unsigned long start, unsigned long end) { unsigned long next; pgd_t *pgd; unsigned long addr = start; pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) mask |= PGTBL_PGD_MODIFIED; if (pgd_none_or_clear_bad(pgd)) continue; vunmap_p4d_range(pgd, addr, next, &mask); } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); } void vunmap_range_noflush(unsigned long start, unsigned long end) { kmsan_vunmap_range_noflush(start, end); __vunmap_range_noflush(start, end); } /** * vunmap_range - unmap kernel virtual addresses * @addr: start of the VM area to unmap * @end: end of the VM area to unmap (non-inclusive) * * Clears any present PTEs in the virtual address range, flushes TLBs and * caches. Any subsequent access to the address before it has been re-mapped * is a kernel bug. */ void vunmap_range(unsigned long addr, unsigned long end) { flush_cache_vunmap(addr, end); vunmap_range_noflush(addr, end); flush_tlb_kernel_range(addr, end); } static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { pte_t *pte; /* * nr is a running index into the array which helps higher level * callers keep track of where we're up to. */ pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { struct page *page = pages[*nr]; if (WARN_ON(!pte_none(ptep_get(pte)))) return -EBUSY; if (WARN_ON(!page)) return -ENOMEM; if (WARN_ON(!pfn_valid(page_to_pfn(page)))) return -EINVAL; set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); *mask |= PGTBL_PTE_MODIFIED; return 0; } static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; } static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages) { unsigned long start = addr; pgd_t *pgd; unsigned long next; int err = 0; int nr = 0; pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) mask |= PGTBL_PGD_MODIFIED; err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) return err; } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); return 0; } /* * vmap_pages_range_noflush is similar to vmap_pages_range, but does not * flush caches. * * The caller is responsible for calling flush_cache_vmap() after this * function returns successfully and before the addresses are accessed. * * This is an internal function only. Do not use outside mm/. */ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { unsigned int i, nr = (end - addr) >> PAGE_SHIFT; WARN_ON(page_shift < PAGE_SHIFT); if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || page_shift == PAGE_SHIFT) return vmap_small_pages_range_noflush(addr, end, prot, pages); for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { int err; err = vmap_range_noflush(addr, addr + (1UL << page_shift), page_to_phys(pages[i]), prot, page_shift); if (err) return err; addr += 1UL << page_shift; } return 0; } int vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift); if (ret) return ret; return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); } /** * vmap_pages_range - map pages to a kernel virtual address * @addr: start of the VM area to map * @end: end of the VM area to map (non-inclusive) * @prot: page protection flags to use * @pages: pages to map (always PAGE_SIZE pages) * @page_shift: maximum shift that the pages may be mapped with, @pages must * be aligned and contiguous up to at least this shift. * * RETURNS: * 0 on success, -errno on failure. */ int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { int err; err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); flush_cache_vmap(addr, end); return err; } static int check_sparse_vm_area(struct vm_struct *area, unsigned long start, unsigned long end) { might_sleep(); if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS)) return -EINVAL; if (WARN_ON_ONCE(area->flags & VM_NO_GUARD)) return -EINVAL; if (WARN_ON_ONCE(!(area->flags & VM_SPARSE))) return -EINVAL; if ((end - start) >> PAGE_SHIFT > totalram_pages()) return -E2BIG; if (start < (unsigned long)area->addr || (void *)end > area->addr + get_vm_area_size(area)) return -ERANGE; return 0; } /** * vm_area_map_pages - map pages inside given sparse vm_area * @area: vm_area * @start: start address inside vm_area * @end: end address inside vm_area * @pages: pages to map (always PAGE_SIZE pages) */ int vm_area_map_pages(struct vm_struct *area, unsigned long start, unsigned long end, struct page **pages) { int err; err = check_sparse_vm_area(area, start, end); if (err) return err; return vmap_pages_range(start, end, PAGE_KERNEL, pages, PAGE_SHIFT); } /** * vm_area_unmap_pages - unmap pages inside given sparse vm_area * @area: vm_area * @start: start address inside vm_area * @end: end address inside vm_area */ void vm_area_unmap_pages(struct vm_struct *area, unsigned long start, unsigned long end) { if (check_sparse_vm_area(area, start, end)) return; vunmap_range(start, end); } int is_vmalloc_or_module_addr(const void *x) { /* * ARM, x86-64 and sparc64 put modules in a special place, * and fall back on vmalloc() if that fails. Others * just put it in the vmalloc space. */ #if defined(CONFIG_EXECMEM) && defined(MODULES_VADDR) unsigned long addr = (unsigned long)kasan_reset_tag(x); if (addr >= MODULES_VADDR && addr < MODULES_END) return 1; #endif return is_vmalloc_addr(x); } EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr); /* * Walk a vmap address to the struct page it maps. Huge vmap mappings will * return the tail page that corresponds to the base page address, which * matches small vmap mappings. */ struct page *vmalloc_to_page(const void *vmalloc_addr) { unsigned long addr = (unsigned long) vmalloc_addr; struct page *page = NULL; pgd_t *pgd = pgd_offset_k(addr); p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; /* * XXX we might need to change this if we add VIRTUAL_BUG_ON for * architectures that do not vmalloc module space */ VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); if (pgd_none(*pgd)) return NULL; if (WARN_ON_ONCE(pgd_leaf(*pgd))) return NULL; /* XXX: no allowance for huge pgd */ if (WARN_ON_ONCE(pgd_bad(*pgd))) return NULL; p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) return NULL; if (p4d_leaf(*p4d)) return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(p4d_bad(*p4d))) return NULL; pud = pud_offset(p4d, addr); if (pud_none(*pud)) return NULL; if (pud_leaf(*pud)) return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(pud_bad(*pud))) return NULL; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return NULL; if (pmd_leaf(*pmd)) return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(pmd_bad(*pmd))) return NULL; ptep = pte_offset_kernel(pmd, addr); pte = ptep_get(ptep); if (pte_present(pte)) page = pte_page(pte); return page; } EXPORT_SYMBOL(vmalloc_to_page); /* * Map a vmalloc()-space virtual address to the physical page frame number. */ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) { return page_to_pfn(vmalloc_to_page(vmalloc_addr)); } EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 static DEFINE_SPINLOCK(free_vmap_area_lock); static bool vmap_initialized __read_mostly; /* * This kmem_cache is used for vmap_area objects. Instead of * allocating from slab we reuse an object from this cache to * make things faster. Especially in "no edge" splitting of * free block. */ static struct kmem_cache *vmap_area_cachep; /* * This linked list is used in pair with free_vmap_area_root. * It gives O(1) access to prev/next to perform fast coalescing. */ static LIST_HEAD(free_vmap_area_list); /* * This augment red-black tree represents the free vmap space. * All vmap_area objects in this tree are sorted by va->va_start * address. It is used for allocation and merging when a vmap * object is released. * * Each vmap_area node contains a maximum available free block * of its sub-tree, right or left. Therefore it is possible to * find a lowest match of free area. */ static struct rb_root free_vmap_area_root = RB_ROOT; /* * Preload a CPU with one object for "no edge" split case. The * aim is to get rid of allocations from the atomic context, thus * to use more permissive allocation masks. */ static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); /* * This structure defines a single, solid model where a list and * rb-tree are part of one entity protected by the lock. Nodes are * sorted in ascending order, thus for O(1) access to left/right * neighbors a list is used as well as for sequential traversal. */ struct rb_list { struct rb_root root; struct list_head head; spinlock_t lock; }; /* * A fast size storage contains VAs up to 1M size. A pool consists * of linked between each other ready to go VAs of certain sizes. * An index in the pool-array corresponds to number of pages + 1. */ #define MAX_VA_SIZE_PAGES 256 struct vmap_pool { struct list_head head; unsigned long len; }; /* * An effective vmap-node logic. Users make use of nodes instead * of a global heap. It allows to balance an access and mitigate * contention. */ static struct vmap_node { /* Simple size segregated storage. */ struct vmap_pool pool[MAX_VA_SIZE_PAGES]; spinlock_t pool_lock; bool skip_populate; /* Bookkeeping data of this node. */ struct rb_list busy; struct rb_list lazy; /* * Ready-to-free areas. */ struct list_head purge_list; struct work_struct purge_work; unsigned long nr_purged; } single; /* * Initial setup consists of one single node, i.e. a balancing * is fully disabled. Later on, after vmap is initialized these * parameters are updated based on a system capacity. */ static struct vmap_node *vmap_nodes = &single; static __read_mostly unsigned int nr_vmap_nodes = 1; static __read_mostly unsigned int vmap_zone_size = 1; static inline unsigned int addr_to_node_id(unsigned long addr) { return (addr / vmap_zone_size) % nr_vmap_nodes; } static inline struct vmap_node * addr_to_node(unsigned long addr) { return &vmap_nodes[addr_to_node_id(addr)]; } static inline struct vmap_node * id_to_node(unsigned int id) { return &vmap_nodes[id % nr_vmap_nodes]; } /* * We use the value 0 to represent "no node", that is why * an encoded value will be the node-id incremented by 1. * It is always greater then 0. A valid node_id which can * be encoded is [0:nr_vmap_nodes - 1]. If a passed node_id * is not valid 0 is returned. */ static unsigned int encode_vn_id(unsigned int node_id) { /* Can store U8_MAX [0:254] nodes. */ if (node_id < nr_vmap_nodes) return (node_id + 1) << BITS_PER_BYTE; /* Warn and no node encoded. */ WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id); return 0; } /* * Returns an encoded node-id, the valid range is within * [0:nr_vmap_nodes-1] values. Otherwise nr_vmap_nodes is * returned if extracted data is wrong. */ static unsigned int decode_vn_id(unsigned int val) { unsigned int node_id = (val >> BITS_PER_BYTE) - 1; /* Can store U8_MAX [0:254] nodes. */ if (node_id < nr_vmap_nodes) return node_id; /* If it was _not_ zero, warn. */ WARN_ONCE(node_id != UINT_MAX, "Decode wrong node id (%d)\n", node_id); return nr_vmap_nodes; } static bool is_vn_id_valid(unsigned int node_id) { if (node_id < nr_vmap_nodes) return true; return false; } static __always_inline unsigned long va_size(struct vmap_area *va) { return (va->va_end - va->va_start); } static __always_inline unsigned long get_subtree_max_size(struct rb_node *node) { struct vmap_area *va; va = rb_entry_safe(node, struct vmap_area, rb_node); return va ? va->subtree_max_size : 0; } RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) static void reclaim_and_purge_vmap_areas(void); static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); static void drain_vmap_area_work(struct work_struct *work); static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); static atomic_long_t nr_vmalloc_pages; unsigned long vmalloc_nr_pages(void) { return atomic_long_read(&nr_vmalloc_pages); } static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) { struct rb_node *n = root->rb_node; addr = (unsigned long)kasan_reset_tag((void *)addr); while (n) { struct vmap_area *va; va = rb_entry(n, struct vmap_area, rb_node); if (addr < va->va_start) n = n->rb_left; else if (addr >= va->va_end) n = n->rb_right; else return va; } return NULL; } /* Look up the first VA which satisfies addr < va_end, NULL if none. */ static struct vmap_area * __find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) { struct vmap_area *va = NULL; struct rb_node *n = root->rb_node; addr = (unsigned long)kasan_reset_tag((void *)addr); while (n) { struct vmap_area *tmp; tmp = rb_entry(n, struct vmap_area, rb_node); if (tmp->va_end > addr) { va = tmp; if (tmp->va_start <= addr) break; n = n->rb_left; } else n = n->rb_right; } return va; } /* * Returns a node where a first VA, that satisfies addr < va_end, resides. * If success, a node is locked. A user is responsible to unlock it when a * VA is no longer needed to be accessed. * * Returns NULL if nothing found. */ static struct vmap_node * find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va) { unsigned long va_start_lowest; struct vmap_node *vn; int i; repeat: for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; spin_lock(&vn->busy.lock); *va = __find_vmap_area_exceed_addr(addr, &vn->busy.root); if (*va) if (!va_start_lowest || (*va)->va_start < va_start_lowest) va_start_lowest = (*va)->va_start; spin_unlock(&vn->busy.lock); } /* * Check if found VA exists, it might have gone away. In this case we * repeat the search because a VA has been removed concurrently and we * need to proceed to the next one, which is a rare case. */ if (va_start_lowest) { vn = addr_to_node(va_start_lowest); spin_lock(&vn->busy.lock); *va = __find_vmap_area(va_start_lowest, &vn->busy.root); if (*va) return vn; spin_unlock(&vn->busy.lock); goto repeat; } return NULL; } /* * This function returns back addresses of parent node * and its left or right link for further processing. * * Otherwise NULL is returned. In that case all further * steps regarding inserting of conflicting overlap range * have to be declined and actually considered as a bug. */ static __always_inline struct rb_node ** find_va_links(struct vmap_area *va, struct rb_root *root, struct rb_node *from, struct rb_node **parent) { struct vmap_area *tmp_va; struct rb_node **link; if (root) { link = &root->rb_node; if (unlikely(!*link)) { *parent = NULL; return link; } } else { link = &from; } /* * Go to the bottom of the tree. When we hit the last point * we end up with parent rb_node and correct direction, i name * it link, where the new va->rb_node will be attached to. */ do { tmp_va = rb_entry(*link, struct vmap_area, rb_node); /* * During the traversal we also do some sanity check. * Trigger the BUG() if there are sides(left/right) * or full overlaps. */ if (va->va_end <= tmp_va->va_start) link = &(*link)->rb_left; else if (va->va_start >= tmp_va->va_end) link = &(*link)->rb_right; else { WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n", va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end); return NULL; } } while (*link); *parent = &tmp_va->rb_node; return link; } static __always_inline struct list_head * get_va_next_sibling(struct rb_node *parent, struct rb_node **link) { struct list_head *list; if (unlikely(!parent)) /* * The red-black tree where we try to find VA neighbors * before merging or inserting is empty, i.e. it means * there is no free vmap space. Normally it does not * happen but we handle this case anyway. */ return NULL; list = &rb_entry(parent, struct vmap_area, rb_node)->list; return (&parent->rb_right == link ? list->next : list); } static __always_inline void __link_va(struct vmap_area *va, struct rb_root *root, struct rb_node *parent, struct rb_node **link, struct list_head *head, bool augment) { /* * VA is still not in the list, but we can * identify its future previous list_head node. */ if (likely(parent)) { head = &rb_entry(parent, struct vmap_area, rb_node)->list; if (&parent->rb_right != link) head = head->prev; } /* Insert to the rb-tree */ rb_link_node(&va->rb_node, parent, link); if (augment) { /* * Some explanation here. Just perform simple insertion * to the tree. We do not set va->subtree_max_size to * its current size before calling rb_insert_augmented(). * It is because we populate the tree from the bottom * to parent levels when the node _is_ in the tree. * * Therefore we set subtree_max_size to zero after insertion, * to let __augment_tree_propagate_from() puts everything to * the correct order later on. */ rb_insert_augmented(&va->rb_node, root, &free_vmap_area_rb_augment_cb); va->subtree_max_size = 0; } else { rb_insert_color(&va->rb_node, root); } /* Address-sort this list */ list_add(&va->list, head); } static __always_inline void link_va(struct vmap_area *va, struct rb_root *root, struct rb_node *parent, struct rb_node **link, struct list_head *head) { __link_va(va, root, parent, link, head, false); } static __always_inline void link_va_augment(struct vmap_area *va, struct rb_root *root, struct rb_node *parent, struct rb_node **link, struct list_head *head) { __link_va(va, root, parent, link, head, true); } static __always_inline void __unlink_va(struct vmap_area *va, struct rb_root *root, bool augment) { if (WARN_ON(RB_EMPTY_NODE(&va->rb_node))) return; if (augment) rb_erase_augmented(&va->rb_node, root, &free_vmap_area_rb_augment_cb); else rb_erase(&va->rb_node, root); list_del_init(&va->list); RB_CLEAR_NODE(&va->rb_node); } static __always_inline void unlink_va(struct vmap_area *va, struct rb_root *root) { __unlink_va(va, root, false); } static __always_inline void unlink_va_augment(struct vmap_area *va, struct rb_root *root) { __unlink_va(va, root, true); } #if DEBUG_AUGMENT_PROPAGATE_CHECK /* * Gets called when remove the node and rotate. */ static __always_inline unsigned long compute_subtree_max_size(struct vmap_area *va) { return max3(va_size(va), get_subtree_max_size(va->rb_node.rb_left), get_subtree_max_size(va->rb_node.rb_right)); } static void augment_tree_propagate_check(void) { struct vmap_area *va; unsigned long computed_size; list_for_each_entry(va, &free_vmap_area_list, list) { computed_size = compute_subtree_max_size(va); if (computed_size != va->subtree_max_size) pr_emerg("tree is corrupted: %lu, %lu\n", va_size(va), va->subtree_max_size); } } #endif /* * This function populates subtree_max_size from bottom to upper * levels starting from VA point. The propagation must be done * when VA size is modified by changing its va_start/va_end. Or * in case of newly inserting of VA to the tree. * * It means that __augment_tree_propagate_from() must be called: * - After VA has been inserted to the tree(free path); * - After VA has been shrunk(allocation path); * - After VA has been increased(merging path). * * Please note that, it does not mean that upper parent nodes * and their subtree_max_size are recalculated all the time up * to the root node. * * 4--8 * /\ * / \ * / \ * 2--2 8--8 * * For example if we modify the node 4, shrinking it to 2, then * no any modification is required. If we shrink the node 2 to 1 * its subtree_max_size is updated only, and set to 1. If we shrink * the node 8 to 6, then its subtree_max_size is set to 6 and parent * node becomes 4--6. */ static __always_inline void augment_tree_propagate_from(struct vmap_area *va) { /* * Populate the tree from bottom towards the root until * the calculated maximum available size of checked node * is equal to its current one. */ free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL); #if DEBUG_AUGMENT_PROPAGATE_CHECK augment_tree_propagate_check(); #endif } static void insert_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head) { struct rb_node **link; struct rb_node *parent; link = find_va_links(va, root, NULL, &parent); if (link) link_va(va, root, parent, link, head); } static void insert_vmap_area_augment(struct vmap_area *va, struct rb_node *from, struct rb_root *root, struct list_head *head) { struct rb_node **link; struct rb_node *parent; if (from) link = find_va_links(va, NULL, from, &parent); else link = find_va_links(va, root, NULL, &parent); if (link) { link_va_augment(va, root, parent, link, head); augment_tree_propagate_from(va); } } /* * Merge de-allocated chunk of VA memory with previous * and next free blocks. If coalesce is not done a new * free area is inserted. If VA has been merged, it is * freed. * * Please note, it can return NULL in case of overlap * ranges, followed by WARN() report. Despite it is a * buggy behaviour, a system can be alive and keep * ongoing. */ static __always_inline struct vmap_area * __merge_or_add_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head, bool augment) { struct vmap_area *sibling; struct list_head *next; struct rb_node **link; struct rb_node *parent; bool merged = false; /* * Find a place in the tree where VA potentially will be * inserted, unless it is merged with its sibling/siblings. */ link = find_va_links(va, root, NULL, &parent); if (!link) return NULL; /* * Get next node of VA to check if merging can be done. */ next = get_va_next_sibling(parent, link); if (unlikely(next == NULL)) goto insert; /* * start end * | | * |<------VA------>|<-----Next----->| * | | * start end */ if (next != head) { sibling = list_entry(next, struct vmap_area, list); if (sibling->va_start == va->va_end) { sibling->va_start = va->va_start; /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); /* Point to the new merged area. */ va = sibling; merged = true; } } /* * start end * | | * |<-----Prev----->|<------VA------>| * | | * start end */ if (next->prev != head) { sibling = list_entry(next->prev, struct vmap_area, list); if (sibling->va_end == va->va_start) { /* * If both neighbors are coalesced, it is important * to unlink the "next" node first, followed by merging * with "previous" one. Otherwise the tree might not be * fully populated if a sibling's augmented value is * "normalized" because of rotation operations. */ if (merged) __unlink_va(va, root, augment); sibling->va_end = va->va_end; /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); /* Point to the new merged area. */ va = sibling; merged = true; } } insert: if (!merged) __link_va(va, root, parent, link, head, augment); return va; } static __always_inline struct vmap_area * merge_or_add_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head) { return __merge_or_add_vmap_area(va, root, head, false); } static __always_inline struct vmap_area * merge_or_add_vmap_area_augment(struct vmap_area *va, struct rb_root *root, struct list_head *head) { va = __merge_or_add_vmap_area(va, root, head, true); if (va) augment_tree_propagate_from(va); return va; } static __always_inline bool is_within_this_va(struct vmap_area *va, unsigned long size, unsigned long align, unsigned long vstart) { unsigned long nva_start_addr; if (va->va_start > vstart) nva_start_addr = ALIGN(va->va_start, align); else nva_start_addr = ALIGN(vstart, align); /* Can be overflowed due to big size or alignment. */ if (nva_start_addr + size < nva_start_addr || nva_start_addr < vstart) return false; return (nva_start_addr + size <= va->va_end); } /* * Find the first free block(lowest start address) in the tree, * that will accomplish the request corresponding to passing * parameters. Please note, with an alignment bigger than PAGE_SIZE, * a search length is adjusted to account for worst case alignment * overhead. */ static __always_inline struct vmap_area * find_vmap_lowest_match(struct rb_root *root, unsigned long size, unsigned long align, unsigned long vstart, bool adjust_search_size) { struct vmap_area *va; struct rb_node *node; unsigned long length; /* Start from the root. */ node = root->rb_node; /* Adjust the search size for alignment overhead. */ length = adjust_search_size ? size + align - 1 : size; while (node) { va = rb_entry(node, struct vmap_area, rb_node); if (get_subtree_max_size(node->rb_left) >= length && vstart < va->va_start) { node = node->rb_left; } else { if (is_within_this_va(va, size, align, vstart)) return va; /* * Does not make sense to go deeper towards the right * sub-tree if it does not have a free block that is * equal or bigger to the requested search length. */ if (get_subtree_max_size(node->rb_right) >= length) { node = node->rb_right; continue; } /* * OK. We roll back and find the first right sub-tree, * that will satisfy the search criteria. It can happen * due to "vstart" restriction or an alignment overhead * that is bigger then PAGE_SIZE. */ while ((node = rb_parent(node))) { va = rb_entry(node, struct vmap_area, rb_node); if (is_within_this_va(va, size, align, vstart)) return va; if (get_subtree_max_size(node->rb_right) >= length && vstart <= va->va_start) { /* * Shift the vstart forward. Please note, we update it with * parent's start address adding "1" because we do not want * to enter same sub-tree after it has already been checked * and no suitable free block found there. */ vstart = va->va_start + 1; node = node->rb_right; break; } } } } return NULL; } #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK #include <linux/random.h> static struct vmap_area * find_vmap_lowest_linear_match(struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart) { struct vmap_area *va; list_for_each_entry(va, head, list) { if (!is_within_this_va(va, size, align, vstart)) continue; return va; } return NULL; } static void find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head, unsigned long size, unsigned long align) { struct vmap_area *va_1, *va_2; unsigned long vstart; unsigned int rnd; get_random_bytes(&rnd, sizeof(rnd)); vstart = VMALLOC_START + rnd; va_1 = find_vmap_lowest_match(root, size, align, vstart, false); va_2 = find_vmap_lowest_linear_match(head, size, align, vstart); if (va_1 != va_2) pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", va_1, va_2, vstart); } #endif enum fit_type { NOTHING_FIT = 0, FL_FIT_TYPE = 1, /* full fit */ LE_FIT_TYPE = 2, /* left edge fit */ RE_FIT_TYPE = 3, /* right edge fit */ NE_FIT_TYPE = 4 /* no edge fit */ }; static __always_inline enum fit_type classify_va_fit_type(struct vmap_area *va, unsigned long nva_start_addr, unsigned long size) { enum fit_type type; /* Check if it is within VA. */ if (nva_start_addr < va->va_start || nva_start_addr + size > va->va_end) return NOTHING_FIT; /* Now classify. */ if (va->va_start == nva_start_addr) { if (va->va_end == nva_start_addr + size) type = FL_FIT_TYPE; else type = LE_FIT_TYPE; } else if (va->va_end == nva_start_addr + size) { type = RE_FIT_TYPE; } else { type = NE_FIT_TYPE; } return type; } static __always_inline int va_clip(struct rb_root *root, struct list_head *head, struct vmap_area *va, unsigned long nva_start_addr, unsigned long size) { struct vmap_area *lva = NULL; enum fit_type type = classify_va_fit_type(va, nva_start_addr, size); if (type == FL_FIT_TYPE) { /* * No need to split VA, it fully fits. * * | | * V NVA V * |---------------| */ unlink_va_augment(va, root); kmem_cache_free(vmap_area_cachep, va); } else if (type == LE_FIT_TYPE) { /* * Split left edge of fit VA. * * | | * V NVA V R * |-------|-------| */ va->va_start += size; } else if (type == RE_FIT_TYPE) { /* * Split right edge of fit VA. * * | | * L V NVA V * |-------|-------| */ va->va_end = nva_start_addr; } else if (type == NE_FIT_TYPE) { /* * Split no edge of fit VA. * * | | * L V NVA V R * |---|-------|---| */ lva = __this_cpu_xchg(ne_fit_preload_node, NULL); if (unlikely(!lva)) { /* * For percpu allocator we do not do any pre-allocation * and leave it as it is. The reason is it most likely * never ends up with NE_FIT_TYPE splitting. In case of * percpu allocations offsets and sizes are aligned to * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE * are its main fitting cases. * * There are a few exceptions though, as an example it is * a first allocation (early boot up) when we have "one" * big free space that has to be split. * * Also we can hit this path in case of regular "vmap" * allocations, if "this" current CPU was not preloaded. * See the comment in alloc_vmap_area() why. If so, then * GFP_NOWAIT is used instead to get an extra object for * split purpose. That is rare and most time does not * occur. * * What happens if an allocation gets failed. Basically, * an "overflow" path is triggered to purge lazily freed * areas to free some memory, then, the "retry" path is * triggered to repeat one more time. See more details * in alloc_vmap_area() function. */ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); if (!lva) return -1; } /* * Build the remainder. */ lva->va_start = va->va_start; lva->va_end = nva_start_addr; /* * Shrink this VA to remaining size. */ va->va_start = nva_start_addr + size; } else { return -1; } if (type != FL_FIT_TYPE) { augment_tree_propagate_from(va); if (lva) /* type == NE_FIT_TYPE */ insert_vmap_area_augment(lva, &va->rb_node, root, head); } return 0; } static unsigned long va_alloc(struct vmap_area *va, struct rb_root *root, struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { unsigned long nva_start_addr; int ret; if (va->va_start > vstart) nva_start_addr = ALIGN(va->va_start, align); else nva_start_addr = ALIGN(vstart, align); /* Check the "vend" restriction. */ if (nva_start_addr + size > vend) return vend; /* Update the free vmap_area. */ ret = va_clip(root, head, va, nva_start_addr, size); if (WARN_ON_ONCE(ret)) return vend; return nva_start_addr; } /* * Returns a start address of the newly allocated area, if success. * Otherwise a vend is returned that indicates failure. */ static __always_inline unsigned long __alloc_vmap_area(struct rb_root *root, struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { bool adjust_search_size = true; unsigned long nva_start_addr; struct vmap_area *va; /* * Do not adjust when: * a) align <= PAGE_SIZE, because it does not make any sense. * All blocks(their start addresses) are at least PAGE_SIZE * aligned anyway; * b) a short range where a requested size corresponds to exactly * specified [vstart:vend] interval and an alignment > PAGE_SIZE. * With adjusted search length an allocation would not succeed. */ if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size)) adjust_search_size = false; va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size); if (unlikely(!va)) return vend; nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend); if (nva_start_addr == vend) return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK find_vmap_lowest_match_check(root, head, size, align); #endif return nva_start_addr; } /* * Free a region of KVA allocated by alloc_vmap_area */ static void free_vmap_area(struct vmap_area *va) { struct vmap_node *vn = addr_to_node(va->va_start); /* * Remove from the busy tree/list. */ spin_lock(&vn->busy.lock); unlink_va(va, &vn->busy.root); spin_unlock(&vn->busy.lock); /* * Insert/Merge it back to the free tree/list. */ spin_lock(&free_vmap_area_lock); merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); spin_unlock(&free_vmap_area_lock); } static inline void preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) { struct vmap_area *va = NULL, *tmp; /* * Preload this CPU with one extra vmap_area object. It is used * when fit type of free area is NE_FIT_TYPE. It guarantees that * a CPU that does an allocation is preloaded. * * We do it in non-atomic context, thus it allows us to use more * permissive allocation masks to be more stable under low memory * condition and high memory pressure. */ if (!this_cpu_read(ne_fit_preload_node)) va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); spin_lock(lock); tmp = NULL; if (va && !__this_cpu_try_cmpxchg(ne_fit_preload_node, &tmp, va)) kmem_cache_free(vmap_area_cachep, va); } static struct vmap_pool * size_to_va_pool(struct vmap_node *vn, unsigned long size) { unsigned int idx = (size - 1) / PAGE_SIZE; if (idx < MAX_VA_SIZE_PAGES) return &vn->pool[idx]; return NULL; } static bool node_pool_add_va(struct vmap_node *n, struct vmap_area *va) { struct vmap_pool *vp; vp = size_to_va_pool(n, va_size(va)); if (!vp) return false; spin_lock(&n->pool_lock); list_add(&va->list, &vp->head); WRITE_ONCE(vp->len, vp->len + 1); spin_unlock(&n->pool_lock); return true; } static struct vmap_area * node_pool_del_va(struct vmap_node *vn, unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { struct vmap_area *va = NULL; struct vmap_pool *vp; int err = 0; vp = size_to_va_pool(vn, size); if (!vp || list_empty(&vp->head)) return NULL; spin_lock(&vn->pool_lock); if (!list_empty(&vp->head)) { va = list_first_entry(&vp->head, struct vmap_area, list); if (IS_ALIGNED(va->va_start, align)) { /* * Do some sanity check and emit a warning * if one of below checks detects an error. */ err |= (va_size(va) != size); err |= (va->va_start < vstart); err |= (va->va_end > vend); if (!WARN_ON_ONCE(err)) { list_del_init(&va->list); WRITE_ONCE(vp->len, vp->len - 1); } else { va = NULL; } } else { list_move_tail(&va->list, &vp->head); va = NULL; } } spin_unlock(&vn->pool_lock); return va; } static struct vmap_area * node_alloc(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, unsigned long *addr, unsigned int *vn_id) { struct vmap_area *va; *vn_id = 0; *addr = vend; /* * Fallback to a global heap if not vmalloc or there * is only one node. */ if (vstart != VMALLOC_START || vend != VMALLOC_END || nr_vmap_nodes == 1) return NULL; *vn_id = raw_smp_processor_id() % nr_vmap_nodes; va = node_pool_del_va(id_to_node(*vn_id), size, align, vstart, vend); *vn_id = encode_vn_id(*vn_id); if (va) *addr = va->va_start; return va; } static inline void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { vm->flags = flags; vm->addr = (void *)va->va_start; vm->size = va_size(va); vm->caller = caller; va->vm = vm; } /* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. If vm is passed in, the two will also be bound. */ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, int node, gfp_t gfp_mask, unsigned long va_flags, struct vm_struct *vm) { struct vmap_node *vn; struct vmap_area *va; unsigned long freed; unsigned long addr; unsigned int vn_id; int purged = 0; int ret; if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align))) return ERR_PTR(-EINVAL); if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY); might_sleep(); /* * If a VA is obtained from a global heap(if it fails here) * it is anyway marked with this "vn_id" so it is returned * to this pool's node later. Such way gives a possibility * to populate pools based on users demand. * * On success a ready to go VA is returned. */ va = node_alloc(size, align, vstart, vend, &addr, &vn_id); if (!va) { gfp_mask = gfp_mask & GFP_RECLAIM_MASK; va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); /* * Only scan the relevant parts containing pointers to other objects * to avoid false negatives. */ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); } retry: if (addr == vend) { preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, size, align, vstart, vend); spin_unlock(&free_vmap_area_lock); } trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend); /* * If an allocation fails, the "vend" address is * returned. Therefore trigger the overflow path. */ if (unlikely(addr == vend)) goto overflow; va->va_start = addr; va->va_end = addr + size; va->vm = NULL; va->flags = (va_flags | vn_id); if (vm) { vm->addr = (void *)va->va_start; vm->size = va_size(va); va->vm = vm; } vn = addr_to_node(va->va_start); spin_lock(&vn->busy.lock); insert_vmap_area(va, &vn->busy.root, &vn->busy.head); spin_unlock(&vn->busy.lock); BUG_ON(!IS_ALIGNED(va->va_start, align)); BUG_ON(va->va_start < vstart); BUG_ON(va->va_end > vend); ret = kasan_populate_vmalloc(addr, size); if (ret) { free_vmap_area(va); return ERR_PTR(ret); } return va; overflow: if (!purged) { reclaim_and_purge_vmap_areas(); purged = 1; goto retry; } freed = 0; blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); if (freed > 0) { purged = 0; goto retry; } if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) pr_warn("vmalloc_node_range for size %lu failed: Address range restricted to %#lx - %#lx\n", size, vstart, vend); kmem_cache_free(vmap_area_cachep, va); return ERR_PTR(-EBUSY); } int register_vmap_purge_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&vmap_notify_list, nb); } EXPORT_SYMBOL_GPL(register_vmap_purge_notifier); int unregister_vmap_purge_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&vmap_notify_list, nb); } EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); /* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. * * There is a tradeoff here: a larger number will cover more kernel page tables * and take slightly longer to purge, but it will linearly reduce the number of * global TLB flushes that must be performed. It would seem natural to scale * this number up linearly with the number of CPUs (because vmapping activity * could also scale linearly with the number of CPUs), however it is likely * that in practice, workloads might be constrained in other ways that mean * vmap activity will not scale linearly with CPUs. Also, I want to be * conservative and not introduce a big latency on huge systems, so go with * a less aggressive log scale. It will still be an improvement over the old * code, and it will be simple to change the scale factor if we find that it * becomes a problem on bigger systems. */ static unsigned long lazy_max_pages(void) { unsigned int log; log = fls(num_online_cpus()); return log * (32UL * 1024 * 1024 / PAGE_SIZE); } static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); /* * Serialize vmap purging. There is no actual critical section protected * by this lock, but we want to avoid concurrent calls for performance * reasons and to make the pcpu_get_vm_areas more deterministic. */ static DEFINE_MUTEX(vmap_purge_lock); /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); static cpumask_t purge_nodes; static void reclaim_list_global(struct list_head *head) { struct vmap_area *va, *n; if (list_empty(head)) return; spin_lock(&free_vmap_area_lock); list_for_each_entry_safe(va, n, head, list) merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); spin_unlock(&free_vmap_area_lock); } static void decay_va_pool_node(struct vmap_node *vn, bool full_decay) { LIST_HEAD(decay_list); struct rb_root decay_root = RB_ROOT; struct vmap_area *va, *nva; unsigned long n_decay; int i; for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { LIST_HEAD(tmp_list); if (list_empty(&vn->pool[i].head)) continue; /* Detach the pool, so no-one can access it. */ spin_lock(&vn->pool_lock); list_replace_init(&vn->pool[i].head, &tmp_list); spin_unlock(&vn->pool_lock); if (full_decay) WRITE_ONCE(vn->pool[i].len, 0); /* Decay a pool by ~25% out of left objects. */ n_decay = vn->pool[i].len >> 2; list_for_each_entry_safe(va, nva, &tmp_list, list) { list_del_init(&va->list); merge_or_add_vmap_area(va, &decay_root, &decay_list); if (!full_decay) { WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1); if (!--n_decay) break; } } /* * Attach the pool back if it has been partly decayed. * Please note, it is supposed that nobody(other contexts) * can populate the pool therefore a simple list replace * operation takes place here. */ if (!full_decay && !list_empty(&tmp_list)) { spin_lock(&vn->pool_lock); list_replace_init(&tmp_list, &vn->pool[i].head); spin_unlock(&vn->pool_lock); } } reclaim_list_global(&decay_list); } static void kasan_release_vmalloc_node(struct vmap_node *vn) { struct vmap_area *va; unsigned long start, end; start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start; end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end; list_for_each_entry(va, &vn->purge_list, list) { if (is_vmalloc_or_module_addr((void *) va->va_start)) kasan_release_vmalloc(va->va_start, va->va_end, va->va_start, va->va_end, KASAN_VMALLOC_PAGE_RANGE); } kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH); } static void purge_vmap_node(struct work_struct *work) { struct vmap_node *vn = container_of(work, struct vmap_node, purge_work); unsigned long nr_purged_pages = 0; struct vmap_area *va, *n_va; LIST_HEAD(local_list); if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) kasan_release_vmalloc_node(vn); vn->nr_purged = 0; list_for_each_entry_safe(va, n_va, &vn->purge_list, list) { unsigned long nr = va_size(va) >> PAGE_SHIFT; unsigned int vn_id = decode_vn_id(va->flags); list_del_init(&va->list); nr_purged_pages += nr; vn->nr_purged++; if (is_vn_id_valid(vn_id) && !vn->skip_populate) if (node_pool_add_va(vn, va)) continue; /* Go back to global. */ list_add(&va->list, &local_list); } atomic_long_sub(nr_purged_pages, &vmap_lazy_nr); reclaim_list_global(&local_list); } /* * Purges all lazily-freed vmap areas. */ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, bool full_pool_decay) { unsigned long nr_purged_areas = 0; unsigned int nr_purge_helpers; unsigned int nr_purge_nodes; struct vmap_node *vn; int i; lockdep_assert_held(&vmap_purge_lock); /* * Use cpumask to mark which node has to be processed. */ purge_nodes = CPU_MASK_NONE; for (i = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; INIT_LIST_HEAD(&vn->purge_list); vn->skip_populate = full_pool_decay; decay_va_pool_node(vn, full_pool_decay); if (RB_EMPTY_ROOT(&vn->lazy.root)) continue; spin_lock(&vn->lazy.lock); WRITE_ONCE(vn->lazy.root.rb_node, NULL); list_replace_init(&vn->lazy.head, &vn->purge_list); spin_unlock(&vn->lazy.lock); start = min(start, list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start); end = max(end, list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end); cpumask_set_cpu(i, &purge_nodes); } nr_purge_nodes = cpumask_weight(&purge_nodes); if (nr_purge_nodes > 0) { flush_tlb_kernel_range(start, end); /* One extra worker is per a lazy_max_pages() full set minus one. */ nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages(); nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1; for_each_cpu(i, &purge_nodes) { vn = &vmap_nodes[i]; if (nr_purge_helpers > 0) { INIT_WORK(&vn->purge_work, purge_vmap_node); if (cpumask_test_cpu(i, cpu_online_mask)) schedule_work_on(i, &vn->purge_work); else schedule_work(&vn->purge_work); nr_purge_helpers--; } else { vn->purge_work.func = NULL; purge_vmap_node(&vn->purge_work); nr_purged_areas += vn->nr_purged; } } for_each_cpu(i, &purge_nodes) { vn = &vmap_nodes[i]; if (vn->purge_work.func) { flush_work(&vn->purge_work); nr_purged_areas += vn->nr_purged; } } } trace_purge_vmap_area_lazy(start, end, nr_purged_areas); return nr_purged_areas > 0; } /* * Reclaim vmap areas by purging fragmented blocks and purge_vmap_area_list. */ static void reclaim_and_purge_vmap_areas(void) { mutex_lock(&vmap_purge_lock); purge_fragmented_blocks_allcpus(); __purge_vmap_area_lazy(ULONG_MAX, 0, true); mutex_unlock(&vmap_purge_lock); } static void drain_vmap_area_work(struct work_struct *work) { mutex_lock(&vmap_purge_lock); __purge_vmap_area_lazy(ULONG_MAX, 0, false); mutex_unlock(&vmap_purge_lock); } /* * Free a vmap area, caller ensuring that the area has been unmapped, * unlinked and flush_cache_vunmap had been called for the correct * range previously. */ static void free_vmap_area_noflush(struct vmap_area *va) { unsigned long nr_lazy_max = lazy_max_pages(); unsigned long va_start = va->va_start; unsigned int vn_id = decode_vn_id(va->flags); struct vmap_node *vn; unsigned long nr_lazy; if (WARN_ON_ONCE(!list_empty(&va->list))) return; nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT, &vmap_lazy_nr); /* * If it was request by a certain node we would like to * return it to that node, i.e. its pool for later reuse. */ vn = is_vn_id_valid(vn_id) ? id_to_node(vn_id):addr_to_node(va->va_start); spin_lock(&vn->lazy.lock); insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head); spin_unlock(&vn->lazy.lock); trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max); /* After this point, we may free va at any time */ if (unlikely(nr_lazy > nr_lazy_max)) schedule_work(&drain_vmap_work); } /* * Free and unmap a vmap area */ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); vunmap_range_noflush(va->va_start, va->va_end); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); free_vmap_area_noflush(va); } struct vmap_area *find_vmap_area(unsigned long addr) { struct vmap_node *vn; struct vmap_area *va; int i, j; if (unlikely(!vmap_initialized)) return NULL; /* * An addr_to_node_id(addr) converts an address to a node index * where a VA is located. If VA spans several zones and passed * addr is not the same as va->va_start, what is not common, we * may need to scan extra nodes. See an example: * * <----va----> * -|-----|-----|-----|-----|- * 1 2 0 1 * * VA resides in node 1 whereas it spans 1, 2 an 0. If passed * addr is within 2 or 0 nodes we should do extra work. */ i = j = addr_to_node_id(addr); do { vn = &vmap_nodes[i]; spin_lock(&vn->busy.lock); va = __find_vmap_area(addr, &vn->busy.root); spin_unlock(&vn->busy.lock); if (va) return va; } while ((i = (i + 1) % nr_vmap_nodes) != j); return NULL; } static struct vmap_area *find_unlink_vmap_area(unsigned long addr) { struct vmap_node *vn; struct vmap_area *va; int i, j; /* * Check the comment in the find_vmap_area() about the loop. */ i = j = addr_to_node_id(addr); do { vn = &vmap_nodes[i]; spin_lock(&vn->busy.lock); va = __find_vmap_area(addr, &vn->busy.root); if (va) unlink_va(va, &vn->busy.root); spin_unlock(&vn->busy.lock); if (va) return va; } while ((i = (i + 1) % nr_vmap_nodes) != j); return NULL; } /*** Per cpu kva allocator ***/ /* * vmap space is limited especially on 32 bit architectures. Ensure there is * room for at least 16 percpu vmap blocks per CPU. */ /* * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess * instead (we just need a rough idea) */ #if BITS_PER_LONG == 32 #define VMALLOC_SPACE (128UL*1024*1024) #else #define VMALLOC_SPACE (128UL*1024*1024*1024) #endif #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ #define VMAP_BBMAP_BITS \ VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) /* * Purge threshold to prevent overeager purging of fragmented blocks for * regular operations: Purge if vb->free is less than 1/4 of the capacity. */ #define VMAP_PURGE_THRESHOLD (VMAP_BBMAP_BITS / 4) #define VMAP_RAM 0x1 /* indicates vm_map_ram area*/ #define VMAP_BLOCK 0x2 /* mark out the vmap_block sub-type*/ #define VMAP_FLAGS_MASK 0x3 struct vmap_block_queue { spinlock_t lock; struct list_head free; /* * An xarray requires an extra memory dynamically to * be allocated. If it is an issue, we can use rb-tree * instead. */ struct xarray vmap_blocks; }; struct vmap_block { spinlock_t lock; struct vmap_area *va; unsigned long free, dirty; DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS); unsigned long dirty_min, dirty_max; /*< dirty range */ struct list_head free_list; struct rcu_head rcu_head; struct list_head purge; unsigned int cpu; }; /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); /* * In order to fast access to any "vmap_block" associated with a * specific address, we use a hash. * * A per-cpu vmap_block_queue is used in both ways, to serialize * an access to free block chains among CPUs(alloc path) and it * also acts as a vmap_block hash(alloc/free paths). It means we * overload it, since we already have the per-cpu array which is * used as a hash table. When used as a hash a 'cpu' passed to * per_cpu() is not actually a CPU but rather a hash index. * * A hash function is addr_to_vb_xa() which hashes any address * to a specific index(in a hash) it belongs to. This then uses a * per_cpu() macro to access an array with generated index. * * An example: * * CPU_1 CPU_2 CPU_0 * | | | * V V V * 0 10 20 30 40 50 60 * |------|------|------|------|------|------|...<vmap address space> * CPU0 CPU1 CPU2 CPU0 CPU1 CPU2 * * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus * it access: CPU0/INDEX0 -> vmap_blocks -> xa_lock; * * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus * it access: CPU1/INDEX1 -> vmap_blocks -> xa_lock; * * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus * it access: CPU2/INDEX2 -> vmap_blocks -> xa_lock. * * This technique almost always avoids lock contention on insert/remove, * however xarray spinlocks protect against any contention that remains. */ static struct xarray * addr_to_vb_xa(unsigned long addr) { int index = (addr / VMAP_BLOCK_SIZE) % nr_cpu_ids; /* * Please note, nr_cpu_ids points on a highest set * possible bit, i.e. we never invoke cpumask_next() * if an index points on it which is nr_cpu_ids - 1. */ if (!cpu_possible(index)) index = cpumask_next(index, cpu_possible_mask); return &per_cpu(vmap_block_queue, index).vmap_blocks; } /* * We should probably have a fallback mechanism to allocate virtual memory * out of partially filled vmap blocks. However vmap block sizing should be * fairly reasonable according to the vmalloc size, so it shouldn't be a * big problem. */ static unsigned long addr_to_vb_idx(unsigned long addr) { addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); addr /= VMAP_BLOCK_SIZE; return addr; } static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) { unsigned long addr; addr = va_start + (pages_off << PAGE_SHIFT); BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start)); return (void *)addr; } /** * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this * block. Of course pages number can't exceed VMAP_BBMAP_BITS * @order: how many 2^order pages should be occupied in newly allocated block * @gfp_mask: flags for the page level allocator * * Return: virtual address in a newly allocated block or ERR_PTR(-errno) */ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; struct vmap_area *va; struct xarray *xa; unsigned long vb_idx; int node, err; void *vaddr; node = numa_node_id(); vb = kmalloc_node(sizeof(struct vmap_block), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!vb)) return ERR_PTR(-ENOMEM); va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, node, gfp_mask, VMAP_RAM|VMAP_BLOCK, NULL); if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); } vaddr = vmap_block_vaddr(va->va_start, 0); spin_lock_init(&vb->lock); vb->va = va; /* At least something should be left free */ BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); bitmap_zero(vb->used_map, VMAP_BBMAP_BITS); vb->free = VMAP_BBMAP_BITS - (1UL << order); vb->dirty = 0; vb->dirty_min = VMAP_BBMAP_BITS; vb->dirty_max = 0; bitmap_set(vb->used_map, 0, (1UL << order)); INIT_LIST_HEAD(&vb->free_list); vb->cpu = raw_smp_processor_id(); xa = addr_to_vb_xa(va->va_start); vb_idx = addr_to_vb_idx(va->va_start); err = xa_insert(xa, vb_idx, vb, gfp_mask); if (err) { kfree(vb); free_vmap_area(va); return ERR_PTR(err); } /* * list_add_tail_rcu could happened in another core * rather than vb->cpu due to task migration, which * is safe as list_add_tail_rcu will ensure the list's * integrity together with list_for_each_rcu from read * side. */ vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu); spin_lock(&vbq->lock); list_add_tail_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); return vaddr; } static void free_vmap_block(struct vmap_block *vb) { struct vmap_node *vn; struct vmap_block *tmp; struct xarray *xa; xa = addr_to_vb_xa(vb->va->va_start); tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start)); BUG_ON(tmp != vb); vn = addr_to_node(vb->va->va_start); spin_lock(&vn->busy.lock); unlink_va(vb->va, &vn->busy.root); spin_unlock(&vn->busy.lock); free_vmap_area_noflush(vb->va); kfree_rcu(vb, rcu_head); } static bool purge_fragmented_block(struct vmap_block *vb, struct list_head *purge_list, bool force_purge) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, vb->cpu); if (vb->free + vb->dirty != VMAP_BBMAP_BITS || vb->dirty == VMAP_BBMAP_BITS) return false; /* Don't overeagerly purge usable blocks unless requested */ if (!(force_purge || vb->free < VMAP_PURGE_THRESHOLD)) return false; /* prevent further allocs after releasing lock */ WRITE_ONCE(vb->free, 0); /* prevent purging it again */ WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS); vb->dirty_min = 0; vb->dirty_max = VMAP_BBMAP_BITS; spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); spin_unlock(&vbq->lock); list_add_tail(&vb->purge, purge_list); return true; } static void free_purged_blocks(struct list_head *purge_list) { struct vmap_block *vb, *n_vb; list_for_each_entry_safe(vb, n_vb, purge_list, purge) { list_del(&vb->purge); free_vmap_block(vb); } } static void purge_fragmented_blocks(int cpu) { LIST_HEAD(purge); struct vmap_block *vb; struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); rcu_read_lock(); list_for_each_entry_rcu(vb, &vbq->free, free_list) { unsigned long free = READ_ONCE(vb->free); unsigned long dirty = READ_ONCE(vb->dirty); if (free + dirty != VMAP_BBMAP_BITS || dirty == VMAP_BBMAP_BITS) continue; spin_lock(&vb->lock); purge_fragmented_block(vb, &purge, true); spin_unlock(&vb->lock); } rcu_read_unlock(); free_purged_blocks(&purge); } static void purge_fragmented_blocks_allcpus(void) { int cpu; for_each_possible_cpu(cpu) purge_fragmented_blocks(cpu); } static void *vb_alloc(unsigned long size, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; void *vaddr = NULL; unsigned int order; BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); if (WARN_ON(size == 0)) { /* * Allocating 0 bytes isn't what caller wants since * get_order(0) returns funny result. Just warn and terminate * early. */ return ERR_PTR(-EINVAL); } order = get_order(size); rcu_read_lock(); vbq = raw_cpu_ptr(&vmap_block_queue); list_for_each_entry_rcu(vb, &vbq->free, free_list) { unsigned long pages_off; if (READ_ONCE(vb->free) < (1UL << order)) continue; spin_lock(&vb->lock); if (vb->free < (1UL << order)) { spin_unlock(&vb->lock); continue; } pages_off = VMAP_BBMAP_BITS - vb->free; vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); WRITE_ONCE(vb->free, vb->free - (1UL << order)); bitmap_set(vb->used_map, pages_off, (1UL << order)); if (vb->free == 0) { spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); spin_unlock(&vbq->lock); } spin_unlock(&vb->lock); break; } rcu_read_unlock(); /* Allocate new block if nothing was found */ if (!vaddr) vaddr = new_vmap_block(order, gfp_mask); return vaddr; } static void vb_free(unsigned long addr, unsigned long size) { unsigned long offset; unsigned int order; struct vmap_block *vb; struct xarray *xa; BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); flush_cache_vunmap(addr, addr + size); order = get_order(size); offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; xa = addr_to_vb_xa(addr); vb = xa_load(xa, addr_to_vb_idx(addr)); spin_lock(&vb->lock); bitmap_clear(vb->used_map, offset, (1UL << order)); spin_unlock(&vb->lock); vunmap_range_noflush(addr, addr + size); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(addr, addr + size); spin_lock(&vb->lock); /* Expand the not yet TLB flushed dirty range */ vb->dirty_min = min(vb->dirty_min, offset); vb->dirty_max = max(vb->dirty_max, offset + (1UL << order)); WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order)); if (vb->dirty == VMAP_BBMAP_BITS) { BUG_ON(vb->free); spin_unlock(&vb->lock); free_vmap_block(vb); } else spin_unlock(&vb->lock); } static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) { LIST_HEAD(purge_list); int cpu; if (unlikely(!vmap_initialized)) return; mutex_lock(&vmap_purge_lock); for_each_possible_cpu(cpu) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); struct vmap_block *vb; unsigned long idx; rcu_read_lock(); xa_for_each(&vbq->vmap_blocks, idx, vb) { spin_lock(&vb->lock); /* * Try to purge a fragmented block first. If it's * not purgeable, check whether there is dirty * space to be flushed. */ if (!purge_fragmented_block(vb, &purge_list, false) && vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) { unsigned long va_start = vb->va->va_start; unsigned long s, e; s = va_start + (vb->dirty_min << PAGE_SHIFT); e = va_start + (vb->dirty_max << PAGE_SHIFT); start = min(s, start); end = max(e, end); /* Prevent that this is flushed again */ vb->dirty_min = VMAP_BBMAP_BITS; vb->dirty_max = 0; flush = 1; } spin_unlock(&vb->lock); } rcu_read_unlock(); } free_purged_blocks(&purge_list); if (!__purge_vmap_area_lazy(start, end, false) && flush) flush_tlb_kernel_range(start, end); mutex_unlock(&vmap_purge_lock); } /** * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer * * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily * to amortize TLB flushing overheads. What this means is that any page you * have now, may, in a former life, have been mapped into kernel virtual * address by the vmap layer and so there might be some CPUs with TLB entries * still referencing that page (additional to the regular 1:1 kernel mapping). * * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can * be sure that none of the pages we have control over will have any aliases * from the vmap layer. */ void vm_unmap_aliases(void) { unsigned long start = ULONG_MAX, end = 0; int flush = 0; _vm_unmap_aliases(start, end, flush); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); /** * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram * @mem: the pointer returned by vm_map_ram * @count: the count passed to that vm_map_ram call (cannot unmap partial) */ void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr = (unsigned long)kasan_reset_tag(mem); struct vmap_area *va; might_sleep(); BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); BUG_ON(!PAGE_ALIGNED(addr)); kasan_poison_vmalloc(mem, size); if (likely(count <= VMAP_MAX_ALLOC)) { debug_check_no_locks_freed(mem, size); vb_free(addr, size); return; } va = find_unlink_vmap_area(addr); if (WARN_ON_ONCE(!va)) return; debug_check_no_locks_freed((void *)va->va_start, va_size(va)); free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); /** * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) * @pages: an array of pointers to the pages to be mapped * @count: number of pages * @node: prefer to allocate data structures on this node * * If you use this function for less than VMAP_MAX_ALLOC pages, it could be * faster than vmap so it's good. But if you mix long-life and short-life * objects with vm_map_ram(), it could consume lots of address space through * fragmentation (especially on a 32bit machine). You could see failures in * the end. Please use this function for short-lived objects. * * Returns: a pointer to the address that has been mapped, or %NULL on failure */ void *vm_map_ram(struct page **pages, unsigned int count, int node) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr; void *mem; if (likely(count <= VMAP_MAX_ALLOC)) { mem = vb_alloc(size, GFP_KERNEL); if (IS_ERR(mem)) return NULL; addr = (unsigned long)mem; } else { struct vmap_area *va; va = alloc_vmap_area(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END, node, GFP_KERNEL, VMAP_RAM, NULL); if (IS_ERR(va)) return NULL; addr = va->va_start; mem = (void *)addr; } if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, PAGE_SHIFT) < 0) { vm_unmap_ram(mem, count); return NULL; } /* * Mark the pages as accessible, now that they are mapped. * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL); return mem; } EXPORT_SYMBOL(vm_map_ram); static struct vm_struct *vmlist __initdata; static inline unsigned int vm_area_page_order(struct vm_struct *vm) { #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC return vm->page_order; #else return 0; #endif } unsigned int get_vm_area_page_order(struct vm_struct *vm) { return vm_area_page_order(vm); } static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) { #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC vm->page_order = order; #else BUG_ON(order != 0); #endif } /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add * * This function is used to add fixed kernel vm area to vmlist before * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags * should contain proper values and the other fields should be zero. * * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. */ void __init vm_area_add_early(struct vm_struct *vm) { struct vm_struct *tmp, **p; BUG_ON(vmap_initialized); for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { if (tmp->addr >= vm->addr) { BUG_ON(tmp->addr < vm->addr + vm->size); break; } else BUG_ON(tmp->addr + tmp->size > vm->addr); } vm->next = *p; *p = vm; } /** * vm_area_register_early - register vmap area early during boot * @vm: vm_struct to register * @align: requested alignment * * This function is used to register kernel vm area before * vmalloc_init() is called. @vm->size and @vm->flags should contain * proper values on entry and other fields should be zero. On return, * vm->addr contains the allocated address. * * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. */ void __init vm_area_register_early(struct vm_struct *vm, size_t align) { unsigned long addr = ALIGN(VMALLOC_START, align); struct vm_struct *cur, **p; BUG_ON(vmap_initialized); for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) { if ((unsigned long)cur->addr - addr >= vm->size) break; addr = ALIGN((unsigned long)cur->addr + cur->size, align); } BUG_ON(addr > VMALLOC_END - vm->size); vm->addr = (void *)addr; vm->next = *p; *p = vm; kasan_populate_early_vm_area_shadow(vm->addr, vm->size); } static void clear_vm_uninitialized_flag(struct vm_struct *vm) { /* * Before removing VM_UNINITIALIZED, * we should make sure that vm has proper values. * Pair with smp_rmb() in show_numa_info(). */ smp_wmb(); vm->flags &= ~VM_UNINITIALIZED; } struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller) { struct vmap_area *va; struct vm_struct *area; unsigned long requested_size = size; BUG_ON(in_interrupt()); size = ALIGN(size, 1ul << shift); if (unlikely(!size)) return NULL; if (flags & VM_IOREMAP) align = 1ul << clamp_t(int, get_count_order_long(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!area)) return NULL; if (!(flags & VM_NO_GUARD)) size += PAGE_SIZE; area->flags = flags; area->caller = caller; va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area); if (IS_ERR(va)) { kfree(area); return NULL; } /* * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a * best-effort approach, as they can be mapped outside of vmalloc code. * For VM_ALLOC mappings, the pages are marked as accessible after * getting mapped in __vmalloc_node_range(). * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ if (!(flags & VM_ALLOC)) area->addr = kasan_unpoison_vmalloc(area->addr, requested_size, KASAN_VMALLOC_PROT_NORMAL); return area; } struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller) { return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end, NUMA_NO_NODE, GFP_KERNEL, caller); } /** * get_vm_area - reserve a contiguous kernel virtual area * @size: size of the area * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC * * Search an area of @size in the kernel virtual mapping area, * and reserved it for out purposes. Returns the area descriptor * on success or %NULL on failure. * * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); } struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller) { return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, caller); } /** * find_vm_area - find a continuous kernel virtual area * @addr: base address * * Search for the kernel VM area starting at @addr, and return it. * It is up to the caller to do all required locking to keep the returned * pointer valid. * * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *find_vm_area(const void *addr) { struct vmap_area *va; va = find_vmap_area((unsigned long)addr); if (!va) return NULL; return va->vm; } /** * remove_vm_area - find and remove a continuous kernel virtual area * @addr: base address * * Search for the kernel VM area starting at @addr, and remove it. * This function returns the found VM area, but using it is NOT safe * on SMP machines, except for its size or flags. * * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *remove_vm_area(const void *addr) { struct vmap_area *va; struct vm_struct *vm; might_sleep(); if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", addr)) return NULL; va = find_unlink_vmap_area((unsigned long)addr); if (!va || !va->vm) return NULL; vm = va->vm; debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm)); debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm)); kasan_free_module_shadow(vm); kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm)); free_unmap_vmap_area(va); return vm; } static inline void set_area_direct_map(const struct vm_struct *area, int (*set_direct_map)(struct page *page)) { int i; /* HUGE_VMALLOC passes small pages to set_direct_map */ for (i = 0; i < area->nr_pages; i++) if (page_address(area->pages[i])) set_direct_map(area->pages[i]); } /* * Flush the vm mapping and reset the direct map. */ static void vm_reset_perms(struct vm_struct *area) { unsigned long start = ULONG_MAX, end = 0; unsigned int page_order = vm_area_page_order(area); int flush_dmap = 0; int i; /* * Find the start and end range of the direct mappings to make sure that * the vm_unmap_aliases() flush includes the direct map. */ for (i = 0; i < area->nr_pages; i += 1U << page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); if (addr) { unsigned long page_size; page_size = PAGE_SIZE << page_order; start = min(addr, start); end = max(addr + page_size, end); flush_dmap = 1; } } /* * Set direct map to something invalid so that it won't be cached if * there are any accesses after the TLB flush, then flush the TLB and * reset the direct map permissions to the default. */ set_area_direct_map(area, set_direct_map_invalid_noflush); _vm_unmap_aliases(start, end, flush_dmap); set_area_direct_map(area, set_direct_map_default_noflush); } static void delayed_vfree_work(struct work_struct *w) { struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); struct llist_node *t, *llnode; llist_for_each_safe(llnode, t, llist_del_all(&p->list)) vfree(llnode); } /** * vfree_atomic - release memory allocated by vmalloc() * @addr: memory base address * * This one is just like vfree() but can be called in any atomic context * except NMIs. */ void vfree_atomic(const void *addr) { struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); BUG_ON(in_nmi()); kmemleak_free(addr); /* * Use raw_cpu_ptr() because this can be called from preemptible * context. Preemption is absolutely fine here, because the llist_add() * implementation is lockless, so it works even if we are adding to * another cpu's list. schedule_work() should be fine with this too. */ if (addr && llist_add((struct llist_node *)addr, &p->list)) schedule_work(&p->wq); } /** * vfree - Release memory allocated by vmalloc() * @addr: Memory base address * * Free the virtually continuous memory area starting at @addr, as obtained * from one of the vmalloc() family of APIs. This will usually also free the * physical memory underlying the virtual allocation, but that memory is * reference counted, so it will not be freed until the last user goes away. * * If @addr is NULL, no operation is performed. * * Context: * May sleep if called *not* from interrupt context. * Must not be called in NMI context (strictly speaking, it could be * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling * conventions for vfree() arch-dependent would be a really bad idea). */ void vfree(const void *addr) { struct vm_struct *vm; int i; if (unlikely(in_interrupt())) { vfree_atomic(addr); return; } BUG_ON(in_nmi()); kmemleak_free(addr); might_sleep(); if (!addr) return; vm = remove_vm_area(addr); if (unlikely(!vm)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); return; } if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; BUG_ON(!page); mod_memcg_page_state(page, MEMCG_VMALLOC, -1); /* * High-order allocs for huge vmallocs are split, so * can be freed as an array of order-0 allocations */ __free_page(page); cond_resched(); } atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); kvfree(vm->pages); kfree(vm); } EXPORT_SYMBOL(vfree); /** * vunmap - release virtual mapping obtained by vmap() * @addr: memory base address * * Free the virtually contiguous memory area starting at @addr, * which was created from the page array passed to vmap(). * * Must not be called in interrupt context. */ void vunmap(const void *addr) { struct vm_struct *vm; BUG_ON(in_interrupt()); might_sleep(); if (!addr) return; vm = remove_vm_area(addr); if (unlikely(!vm)) { WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n", addr); return; } kfree(vm); } EXPORT_SYMBOL(vunmap); /** * vmap - map an array of pages into virtually contiguous space * @pages: array of page pointers * @count: number of pages to map * @flags: vm_area->flags * @prot: page protection for the mapping * * Maps @count pages from @pages into contiguous kernel virtual space. * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself * (which must be kmalloc or vmalloc memory) and one reference per pages in it * are transferred from the caller to vmap(), and will be freed / dropped when * vfree() is called on the return value. * * Return: the address of the area or %NULL on failure */ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { struct vm_struct *area; unsigned long addr; unsigned long size; /* In bytes */ might_sleep(); if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS)) return NULL; /* * Your top guard is someone else's bottom guard. Not having a top * guard compromises someone else's mappings too. */ if (WARN_ON_ONCE(flags & VM_NO_GUARD)) flags &= ~VM_NO_GUARD; if (count > totalram_pages()) return NULL; size = (unsigned long)count << PAGE_SHIFT; area = get_vm_area_caller(size, flags, __builtin_return_address(0)); if (!area) return NULL; addr = (unsigned long)area->addr; if (vmap_pages_range(addr, addr + size, pgprot_nx(prot), pages, PAGE_SHIFT) < 0) { vunmap(area->addr); return NULL; } if (flags & VM_MAP_PUT_PAGES) { area->pages = pages; area->nr_pages = count; } return area->addr; } EXPORT_SYMBOL(vmap); #ifdef CONFIG_VMAP_PFN struct vmap_pfn_data { unsigned long *pfns; pgprot_t prot; unsigned int idx; }; static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) { struct vmap_pfn_data *data = private; unsigned long pfn = data->pfns[data->idx]; pte_t ptent; if (WARN_ON_ONCE(pfn_valid(pfn))) return -EINVAL; ptent = pte_mkspecial(pfn_pte(pfn, data->prot)); set_pte_at(&init_mm, addr, pte, ptent); data->idx++; return 0; } /** * vmap_pfn - map an array of PFNs into virtually contiguous space * @pfns: array of PFNs * @count: number of pages to map * @prot: page protection for the mapping * * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns * the start address of the mapping. */ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) { struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) }; struct vm_struct *area; area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP, __builtin_return_address(0)); if (!area) return NULL; if (apply_to_page_range(&init_mm, (unsigned long)area->addr, count * PAGE_SIZE, vmap_pfn_apply, &data)) { free_vm_area(area); return NULL; } flush_cache_vmap((unsigned long)area->addr, (unsigned long)area->addr + count * PAGE_SIZE); return area->addr; } EXPORT_SYMBOL_GPL(vmap_pfn); #endif /* CONFIG_VMAP_PFN */ static inline unsigned int vm_area_alloc_pages(gfp_t gfp, int nid, unsigned int order, unsigned int nr_pages, struct page **pages) { unsigned int nr_allocated = 0; struct page *page; int i; /* * For order-0 pages we make use of bulk allocator, if * the page array is partly or not at all populated due * to fails, fallback to a single page allocator that is * more permissive. */ if (!order) { while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; /* * A maximum allowed request is hard-coded and is 100 * pages per call. That is done in order to prevent a * long preemption off scenario in the bulk-allocator * so the range is [1:100]. */ nr_pages_request = min(100U, nr_pages - nr_allocated); /* memory allocation should consider mempolicy, we can't * wrongly use nearest node when nid == NUMA_NO_NODE, * otherwise memory may be allocated in only one node, * but mempolicy wants to alloc memory by interleaving. */ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) nr = alloc_pages_bulk_array_mempolicy_noprof(gfp, nr_pages_request, pages + nr_allocated); else nr = alloc_pages_bulk_array_node_noprof(gfp, nid, nr_pages_request, pages + nr_allocated); nr_allocated += nr; cond_resched(); /* * If zero or pages were obtained partly, * fallback to a single page allocator. */ if (nr != nr_pages_request) break; } } /* High-order pages or fallback path if "bulk" fails. */ while (nr_allocated < nr_pages) { if (!(gfp & __GFP_NOFAIL) && fatal_signal_pending(current)) break; if (nid == NUMA_NO_NODE) page = alloc_pages_noprof(gfp, order); else page = alloc_pages_node_noprof(nid, gfp, order); if (unlikely(!page)) break; /* * High-order allocations must be able to be treated as * independent small pages by callers (as they can with * small-page vmallocs). Some drivers do their own refcounting * on vmalloc_to_page() pages, some use page->mapping, * page->lru, etc. */ if (order) split_page(page, order); /* * Careful, we allocate and map page-order pages, but * tracking is done per PAGE_SIZE page so as to keep the * vm_struct APIs independent of the physical/mapped size. */ for (i = 0; i < (1U << order); i++) pages[nr_allocated + i] = page + i; cond_resched(); nr_allocated += 1U << order; } return nr_allocated; } static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; bool nofail = gfp_mask & __GFP_NOFAIL; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); unsigned long array_size; unsigned int nr_small_pages = size >> PAGE_SHIFT; unsigned int page_order; unsigned int flags; int ret; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node, area->caller); } else { area->pages = kmalloc_node_noprof(array_size, nested_gfp, node); } if (!area->pages) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); free_vm_area(area); return NULL; } set_vm_area_page_order(area, page_shift - PAGE_SHIFT); page_order = vm_area_page_order(area); /* * High-order nofail allocations are really expensive and * potentially dangerous (pre-mature OOM, disruptive reclaim * and compaction etc. * * Please note, the __vmalloc_node_range_noprof() falls-back * to order-0 pages if high-order attempt is unsuccessful. */ area->nr_pages = vm_area_alloc_pages((page_order ? gfp_mask & ~__GFP_NOFAIL : gfp_mask) | __GFP_NOWARN, node, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); if (gfp_mask & __GFP_ACCOUNT) { int i; for (i = 0; i < area->nr_pages; i++) mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1); } /* * If not enough pages were obtained to accomplish an * allocation request, free them via vfree() if any. */ if (area->nr_pages != nr_small_pages) { /* * vm_area_alloc_pages() can fail due to insufficient memory but * also:- * * - a pending fatal signal * - insufficient huge page-order pages * * Since we always retry allocations at order-0 in the huge page * case a warning for either is spurious. */ if (!fatal_signal_pending(current) && page_order == 0) warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocate pages", area->nr_pages * PAGE_SIZE); goto fail; } /* * page tables allocations ignore external gfp mask, enforce it * by the scope API */ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) flags = memalloc_nofs_save(); else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) flags = memalloc_noio_save(); do { ret = vmap_pages_range(addr, addr + size, prot, area->pages, page_shift); if (nofail && (ret < 0)) schedule_timeout_uninterruptible(1); } while (nofail && (ret < 0)); if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) memalloc_nofs_restore(flags); else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) memalloc_noio_restore(flags); if (ret < 0) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to map pages", area->nr_pages * PAGE_SIZE); goto fail; } return area->addr; fail: vfree(area->addr); return NULL; } /** * __vmalloc_node_range - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @start: vm area range start * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Please note that the full set of gfp * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all * supported. * Zone modifiers are not supported. From the reclaim modifiers * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported) * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and * __GFP_RETRY_MAYFAIL are not supported). * * __GFP_NOWARN can be used to suppress failures messages. * * Map them into contiguous kernel virtual space, using a pagetable * protection of @prot. * * Return: the address of the area or %NULL on failure */ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) { struct vm_struct *area; void *ret; kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; unsigned long real_size = size; unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; if (WARN_ON_ONCE(!size)) return NULL; if ((size >> PAGE_SHIFT) > totalram_pages()) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, exceeds total pages", real_size); return NULL; } if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) { /* * Try huge pages. Only try for PAGE_KERNEL allocations, * others like modules don't yet expect huge pages in * their allocations due to apply_to_page_range not * supporting them. */ if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE) shift = PMD_SHIFT; else shift = arch_vmap_pte_supported_shift(size); align = max(real_align, 1UL << shift); size = ALIGN(real_size, 1UL << shift); } again: area = __get_vm_area_node(real_size, align, shift, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) { bool nofail = gfp_mask & __GFP_NOFAIL; warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, vm_struct allocation failed%s", real_size, (nofail) ? ". Retrying." : ""); if (nofail) { schedule_timeout_uninterruptible(1); goto again; } goto fail; } /* * Prepare arguments for __vmalloc_area_node() and * kasan_unpoison_vmalloc(). */ if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { if (kasan_hw_tags_enabled()) { /* * Modify protection bits to allow tagging. * This must be done before mapping. */ prot = arch_vmap_pgprot_tagged(prot); /* * Skip page_alloc poisoning and zeroing for physical * pages backing VM_ALLOC mapping. Memory is instead * poisoned and zeroed by kasan_unpoison_vmalloc(). */ gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO; } /* Take note that the mapping is PAGE_KERNEL. */ kasan_flags |= KASAN_VMALLOC_PROT_NORMAL; } /* Allocate physical pages and map them into vmalloc space. */ ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!ret) goto fail; /* * Mark the pages as accessible, now that they are mapped. * The condition for setting KASAN_VMALLOC_INIT should complement the * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check * to make sure that memory is initialized under the same conditions. * Tag-based KASAN modes only assign tags to normal non-executable * allocations, see __kasan_unpoison_vmalloc(). */ kasan_flags |= KASAN_VMALLOC_VM_ALLOC; if (!want_init_on_free() && want_init_on_alloc(gfp_mask) && (gfp_mask & __GFP_SKIP_ZERO)) kasan_flags |= KASAN_VMALLOC_INIT; /* KASAN_VMALLOC_PROT_NORMAL already set if required. */ area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags); /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. * Now, it is fully initialized, so remove this flag here. */ clear_vm_uninitialized_flag(area); size = PAGE_ALIGN(size); if (!(vm_flags & VM_DEFER_KMEMLEAK)) kmemleak_vmalloc(area, size, gfp_mask); return area->addr; fail: if (shift > PAGE_SHIFT) { shift = PAGE_SHIFT; align = real_align; size = real_size; goto again; } return NULL; } /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @gfp_mask: flags for the page level allocator * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * * Allocate enough pages to cover @size from the page level allocator with * @gfp_mask flags. Map them into contiguous kernel virtual space. * * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL * and __GFP_NOFAIL are not supported * * Any use of gfp flags outside of GFP_KERNEL should be consulted * with mm people. * * Return: pointer to the allocated memory or %NULL on error */ void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, caller); } /* * This is only for performance analysis of vmalloc and stress purpose. * It is required by vmalloc test module, therefore do not use it other * than that. */ #ifdef CONFIG_TEST_VMALLOC_MODULE EXPORT_SYMBOL_GPL(__vmalloc_node_noprof); #endif void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) { return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc_noprof); /** * vmalloc - allocate virtually contiguous memory * @size: allocation size * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_noprof(unsigned long size) { return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_noprof); /** * vmalloc_huge - allocate virtually contiguous memory, allow huge pages * @size: allocation size * @gfp_mask: flags for the page level allocator * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * If @size is greater than or equal to PMD_SIZE, allow using * huge pages for the memory * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) { return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL_GPL(vmalloc_huge_noprof); /** * vzalloc - allocate virtually contiguous memory with zero fill * @size: allocation size * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. * * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_noprof(unsigned long size) { return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc_noprof); /** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace * @size: allocation size * * The resulting memory area is zeroed so it can be mapped to userspace * without leaking data. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_user_noprof(unsigned long size) { return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_user_noprof); /** * vmalloc_node - allocate memory on a specific node * @size: allocation size * @node: numa node * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_node_noprof(unsigned long size, int node) { return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node_noprof); /** * vzalloc_node - allocate memory on a specific node with zero fill * @size: allocation size * @node: numa node * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_node_noprof(unsigned long size, int node) { return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc_node_noprof); /** * vrealloc - reallocate virtually contiguous memory; contents remain unchanged * @p: object to reallocate memory for * @size: the size to reallocate * @flags: the flags for the page level allocator * * If @p is %NULL, vrealloc() behaves exactly like vmalloc(). If @size is 0 and * @p is not a %NULL pointer, the object pointed to is freed. * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that * __GFP_ZERO is not fully honored by this API. * * In any case, the contents of the object pointed to are preserved up to the * lesser of the new and old sizes. * * This function must not be called concurrently with itself or vfree() for the * same memory allocation. * * Return: pointer to the allocated memory; %NULL if @size is zero or in case of * failure */ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) { size_t old_size = 0; void *n; if (!size) { vfree(p); return NULL; } if (p) { struct vm_struct *vm; vm = find_vm_area(p); if (unlikely(!vm)) { WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p); return NULL; } old_size = get_vm_area_size(vm); } /* * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What * would be a good heuristic for when to shrink the vm_area? */ if (size <= old_size) { /* Zero out spare memory. */ if (want_init_on_alloc(flags)) memset((void *)p + size, 0, old_size - size); kasan_poison_vmalloc(p + size, old_size - size); kasan_unpoison_vmalloc(p, size, KASAN_VMALLOC_PROT_NORMAL); return (void *)p; } /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */ n = __vmalloc_noprof(size, flags); if (!n) return NULL; if (p) { memcpy(n, p, old_size); vfree(p); } return n; } #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL) #else /* * 64b systems should always have either DMA or DMA32 zones. For others * GFP_DMA32 should do the right thing and use the normal zone. */ #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #endif /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size * * Allocate enough 32bit PA addressable pages to cover @size from the * page level allocator and map them into contiguous kernel virtual space. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32_noprof(unsigned long size) { return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32_noprof); /** * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory * @size: allocation size * * The resulting memory area is 32bit addressable and zeroed so it can be * mapped to userspace without leaking data. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32_user_noprof(unsigned long size) { return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32_user_noprof); /* * Atomically zero bytes in the iterator. * * Returns the number of zeroed bytes. */ static size_t zero_iter(struct iov_iter *iter, size_t count) { size_t remains = count; while (remains > 0) { size_t num, copied; num = min_t(size_t, remains, PAGE_SIZE); copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter); remains -= copied; if (copied < num) break; } return count - remains; } /* * small helper routine, copy contents to iter from addr. * If the page is not present, fill zero. * * Returns the number of copied bytes. */ static size_t aligned_vread_iter(struct iov_iter *iter, const char *addr, size_t count) { size_t remains = count; struct page *page; while (remains > 0) { unsigned long offset, length; size_t copied = 0; offset = offset_in_page(addr); length = PAGE_SIZE - offset; if (length > remains) length = remains; page = vmalloc_to_page(addr); /* * To do safe access to this _mapped_ area, we need lock. But * adding lock here means that we need to add overhead of * vmalloc()/vfree() calls for this _debug_ interface, rarely * used. Instead of that, we'll use an local mapping via * copy_page_to_iter_nofault() and accept a small overhead in * this access function. */ if (page) copied = copy_page_to_iter_nofault(page, offset, length, iter); else copied = zero_iter(iter, length); addr += copied; remains -= copied; if (copied != length) break; } return count - remains; } /* * Read from a vm_map_ram region of memory. * * Returns the number of copied bytes. */ static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr, size_t count, unsigned long flags) { char *start; struct vmap_block *vb; struct xarray *xa; unsigned long offset; unsigned int rs, re; size_t remains, n; /* * If it's area created by vm_map_ram() interface directly, but * not further subdividing and delegating management to vmap_block, * handle it here. */ if (!(flags & VMAP_BLOCK)) return aligned_vread_iter(iter, addr, count); remains = count; /* * Area is split into regions and tracked with vmap_block, read out * each region and zero fill the hole between regions. */ xa = addr_to_vb_xa((unsigned long) addr); vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr)); if (!vb) goto finished_zero; spin_lock(&vb->lock); if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) { spin_unlock(&vb->lock); goto finished_zero; } for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) { size_t copied; if (remains == 0) goto finished; start = vmap_block_vaddr(vb->va->va_start, rs); if (addr < start) { size_t to_zero = min_t(size_t, start - addr, remains); size_t zeroed = zero_iter(iter, to_zero); addr += zeroed; remains -= zeroed; if (remains == 0 || zeroed != to_zero) goto finished; } /*it could start reading from the middle of used region*/ offset = offset_in_page(addr); n = ((re - rs + 1) << PAGE_SHIFT) - offset; if (n > remains) n = remains; copied = aligned_vread_iter(iter, start + offset, n); addr += copied; remains -= copied; if (copied != n) goto finished; } spin_unlock(&vb->lock); finished_zero: /* zero-fill the left dirty or free regions */ return count - remains + zero_iter(iter, remains); finished: /* We couldn't copy/zero everything */ spin_unlock(&vb->lock); return count - remains; } /** * vread_iter() - read vmalloc area in a safe way to an iterator. * @iter: the iterator to which data should be written. * @addr: vm address. * @count: number of bytes to be read. * * This function checks that addr is a valid vmalloc'ed area, and * copy data from that area to a given buffer. If the given memory range * of [addr...addr+count) includes some valid address, data is copied to * proper area of @buf. If there are memory holes, they'll be zero-filled. * IOREMAP area is treated as memory hole and no copy is done. * * If [addr...addr+count) doesn't includes any intersects with alive * vm_struct area, returns 0. @buf should be kernel's buffer. * * Note: In usual ops, vread() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without * any information, as /proc/kcore. * * Return: number of bytes for which addr and buf should be increased * (same number as @count) or %0 if [addr...addr+count) doesn't * include any intersection with valid vmalloc area */ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) { struct vmap_node *vn; struct vmap_area *va; struct vm_struct *vm; char *vaddr; size_t n, size, flags, remains; unsigned long next; addr = kasan_reset_tag(addr); /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; remains = count; vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va); if (!vn) goto finished_zero; /* no intersects with alive vmap_area */ if ((unsigned long)addr + remains <= va->va_start) goto finished_zero; do { size_t copied; if (remains == 0) goto finished; vm = va->vm; flags = va->flags & VMAP_FLAGS_MASK; /* * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need * be set together with VMAP_RAM. */ WARN_ON(flags == VMAP_BLOCK); if (!vm && !flags) goto next_va; if (vm && (vm->flags & VM_UNINITIALIZED)) goto next_va; /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ smp_rmb(); vaddr = (char *) va->va_start; size = vm ? get_vm_area_size(vm) : va_size(va); if (addr >= vaddr + size) goto next_va; if (addr < vaddr) { size_t to_zero = min_t(size_t, vaddr - addr, remains); size_t zeroed = zero_iter(iter, to_zero); addr += zeroed; remains -= zeroed; if (remains == 0 || zeroed != to_zero) goto finished; } n = vaddr + size - addr; if (n > remains) n = remains; if (flags & VMAP_RAM) copied = vmap_ram_vread_iter(iter, addr, n, flags); else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE)))) copied = aligned_vread_iter(iter, addr, n); else /* IOREMAP | SPARSE area is treated as memory hole */ copied = zero_iter(iter, n); addr += copied; remains -= copied; if (copied != n) goto finished; next_va: next = va->va_end; spin_unlock(&vn->busy.lock); } while ((vn = find_vmap_area_exceed_addr_lock(next, &va))); finished_zero: if (vn) spin_unlock(&vn->busy.lock); /* zero-fill memory holes */ return count - remains + zero_iter(iter, remains); finished: /* Nothing remains, or We couldn't copy/zero everything. */ if (vn) spin_unlock(&vn->busy.lock); return count - remains; } /** * remap_vmalloc_range_partial - map vmalloc pages to userspace * @vma: vma to cover * @uaddr: target user address to start at * @kaddr: virtual address of vmalloc kernel memory * @pgoff: offset from @kaddr to start at * @size: size of map area * * Returns: 0 for success, -Exxx on failure * * This function checks that @kaddr is a valid vmalloc'ed area, * and that it is big enough to cover the range starting at * @uaddr in @vma. Will return failure if that criteria isn't * met. * * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, void *kaddr, unsigned long pgoff, unsigned long size) { struct vm_struct *area; unsigned long off; unsigned long end_index; if (check_shl_overflow(pgoff, PAGE_SHIFT, &off)) return -EINVAL; size = PAGE_ALIGN(size); if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) return -EINVAL; area = find_vm_area(kaddr); if (!area) return -EINVAL; if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) return -EINVAL; if (check_add_overflow(size, off, &end_index) || end_index > get_vm_area_size(area)) return -EINVAL; kaddr += off; do { struct page *page = vmalloc_to_page(kaddr); int ret; ret = vm_insert_page(vma, uaddr, page); if (ret) return ret; uaddr += PAGE_SIZE; kaddr += PAGE_SIZE; size -= PAGE_SIZE; } while (size > 0); vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); return 0; } /** * remap_vmalloc_range - map vmalloc pages to userspace * @vma: vma to cover (map full range of vma) * @addr: vmalloc memory * @pgoff: number of pages into addr before first page to map * * Returns: 0 for success, -Exxx on failure * * This function checks that addr is a valid vmalloc'ed area, and * that it is big enough to cover the vma. Will return failure if * that criteria isn't met. * * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff) { return remap_vmalloc_range_partial(vma, vma->vm_start, addr, pgoff, vma->vm_end - vma->vm_start); } EXPORT_SYMBOL(remap_vmalloc_range); void free_vm_area(struct vm_struct *area) { struct vm_struct *ret; ret = remove_vm_area(area->addr); BUG_ON(ret != area); kfree(area); } EXPORT_SYMBOL_GPL(free_vm_area); #ifdef CONFIG_SMP static struct vmap_area *node_to_va(struct rb_node *n) { return rb_entry_safe(n, struct vmap_area, rb_node); } /** * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to * @addr: target address * * Returns: vmap_area if it is found. If there is no such area * the first highest(reverse order) vmap_area is returned * i.e. va->va_start < addr && va->va_end < addr or NULL * if there are no any areas before @addr. */ static struct vmap_area * pvm_find_va_enclose_addr(unsigned long addr) { struct vmap_area *va, *tmp; struct rb_node *n; n = free_vmap_area_root.rb_node; va = NULL; while (n) { tmp = rb_entry(n, struct vmap_area, rb_node); if (tmp->va_start <= addr) { va = tmp; if (tmp->va_end >= addr) break; n = n->rb_right; } else { n = n->rb_left; } } return va; } /** * pvm_determine_end_from_reverse - find the highest aligned address * of free block below VMALLOC_END * @va: * in - the VA we start the search(reverse order); * out - the VA with the highest aligned end address. * @align: alignment for required highest address * * Returns: determined end address within vmap_area */ static unsigned long pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) { unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); unsigned long addr; if (likely(*va)) { list_for_each_entry_from_reverse((*va), &free_vmap_area_list, list) { addr = min((*va)->va_end & ~(align - 1), vmalloc_end); if ((*va)->va_start < addr) return addr; } } return 0; } /** * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator * @offsets: array containing offset of each area * @sizes: array containing size of each area * @nr_vms: the number of areas to allocate * @align: alignment, all entries in @offsets and @sizes must be aligned to this * * Returns: kmalloc'd vm_struct pointer array pointing to allocated * vm_structs on success, %NULL on failure * * Percpu allocator wants to use congruent vm areas so that it can * maintain the offsets among percpu areas. This function allocates * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to * be scattered pretty far, distance between two areas easily going up * to gigabytes. To avoid interacting with regular vmallocs, these * areas are allocated from top. * * Despite its complicated look, this allocator is rather simple. It * does everything top-down and scans free blocks from the end looking * for matching base. While scanning, if any of the areas do not fit the * base address is pulled down to fit the area. Scanning is repeated till * all the areas fit and then all necessary data structures are inserted * and the result is returned. */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, size_t align) { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); struct vmap_area **vas, *va; struct vm_struct **vms; int area, area2, last_area, term_area; unsigned long base, start, size, end, last_end, orig_start, orig_end; bool purged = false; /* verify parameters and allocate data structures */ BUG_ON(offset_in_page(align) || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { start = offsets[area]; end = start + sizes[area]; /* is everything aligned properly? */ BUG_ON(!IS_ALIGNED(offsets[area], align)); BUG_ON(!IS_ALIGNED(sizes[area], align)); /* detect the area with the highest address */ if (start > offsets[last_area]) last_area = area; for (area2 = area + 1; area2 < nr_vms; area2++) { unsigned long start2 = offsets[area2]; unsigned long end2 = start2 + sizes[area2]; BUG_ON(start2 < end && start < end2); } } last_end = offsets[last_area] + sizes[last_area]; if (vmalloc_end - vmalloc_start < last_end) { WARN_ON(true); return NULL; } vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); if (!vas || !vms) goto err_free2; for (area = 0; area < nr_vms; area++) { vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); if (!vas[area] || !vms[area]) goto err_free; } retry: spin_lock(&free_vmap_area_lock); /* start scanning - we scan from the top, begin with the last area */ area = term_area = last_area; start = offsets[area]; end = start + sizes[area]; va = pvm_find_va_enclose_addr(vmalloc_end); base = pvm_determine_end_from_reverse(&va, align) - end; while (true) { /* * base might have underflowed, add last_end before * comparing. */ if (base + last_end < vmalloc_start + last_end) goto overflow; /* * Fitting base has not been found. */ if (va == NULL) goto overflow; /* * If required width exceeds current VA block, move * base downwards and then recheck. */ if (base + end > va->va_end) { base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; } /* * If this VA does not fit, move base downwards and recheck. */ if (base + start < va->va_start) { va = node_to_va(rb_prev(&va->rb_node)); base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; } /* * This area fits, move on to the previous one. If * the previous one is the terminal one, we're done. */ area = (area + nr_vms - 1) % nr_vms; if (area == term_area) break; start = offsets[area]; end = start + sizes[area]; va = pvm_find_va_enclose_addr(base + end); } /* we've found a fitting base, insert all va's */ for (area = 0; area < nr_vms; area++) { int ret; start = base + offsets[area]; size = sizes[area]; va = pvm_find_va_enclose_addr(start); if (WARN_ON_ONCE(va == NULL)) /* It is a BUG(), but trigger recovery instead. */ goto recovery; ret = va_clip(&free_vmap_area_root, &free_vmap_area_list, va, start, size); if (WARN_ON_ONCE(unlikely(ret))) /* It is a BUG(), but trigger recovery instead. */ goto recovery; /* Allocated area. */ va = vas[area]; va->va_start = start; va->va_end = start + size; } spin_unlock(&free_vmap_area_lock); /* populate the kasan shadow space */ for (area = 0; area < nr_vms; area++) { if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) goto err_free_shadow; } /* insert all vm's */ for (area = 0; area < nr_vms; area++) { struct vmap_node *vn = addr_to_node(vas[area]->va_start); spin_lock(&vn->busy.lock); insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head); setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC, pcpu_get_vm_areas); spin_unlock(&vn->busy.lock); } /* * Mark allocated areas as accessible. Do it now as a best-effort * approach, as they can be mapped outside of vmalloc code. * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ for (area = 0; area < nr_vms; area++) vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, vms[area]->size, KASAN_VMALLOC_PROT_NORMAL); kfree(vas); return vms; recovery: /* * Remove previously allocated areas. There is no * need in removing these areas from the busy tree, * because they are inserted only on the final step * and when pcpu_get_vm_areas() is success. */ while (area--) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end, KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH); vas[area] = NULL; } overflow: spin_unlock(&free_vmap_area_lock); if (!purged) { reclaim_and_purge_vmap_areas(); purged = true; /* Before "retry", check if we recover. */ for (area = 0; area < nr_vms; area++) { if (vas[area]) continue; vas[area] = kmem_cache_zalloc( vmap_area_cachep, GFP_KERNEL); if (!vas[area]) goto err_free; } goto retry; } err_free: for (area = 0; area < nr_vms; area++) { if (vas[area]) kmem_cache_free(vmap_area_cachep, vas[area]); kfree(vms[area]); } err_free2: kfree(vas); kfree(vms); return NULL; err_free_shadow: spin_lock(&free_vmap_area_lock); /* * We release all the vmalloc shadows, even the ones for regions that * hadn't been successfully added. This relies on kasan_release_vmalloc * being able to tolerate this case. */ for (area = 0; area < nr_vms; area++) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end, KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH); vas[area] = NULL; kfree(vms[area]); } spin_unlock(&free_vmap_area_lock); kfree(vas); kfree(vms); return NULL; } /** * pcpu_free_vm_areas - free vmalloc areas for percpu allocator * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() * @nr_vms: the number of allocated areas * * Free vm_structs and the array allocated by pcpu_get_vm_areas(). */ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) { int i; for (i = 0; i < nr_vms; i++) free_vm_area(vms[i]); kfree(vms); } #endif /* CONFIG_SMP */ #ifdef CONFIG_PRINTK bool vmalloc_dump_obj(void *object) { const void *caller; struct vm_struct *vm; struct vmap_area *va; struct vmap_node *vn; unsigned long addr; unsigned int nr_pages; addr = PAGE_ALIGN((unsigned long) object); vn = addr_to_node(addr); if (!spin_trylock(&vn->busy.lock)) return false; va = __find_vmap_area(addr, &vn->busy.root); if (!va || !va->vm) { spin_unlock(&vn->busy.lock); return false; } vm = va->vm; addr = (unsigned long) vm->addr; caller = vm->caller; nr_pages = vm->nr_pages; spin_unlock(&vn->busy.lock); pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", nr_pages, addr, caller); return true; } #endif #ifdef CONFIG_PROC_FS static void show_numa_info(struct seq_file *m, struct vm_struct *v) { if (IS_ENABLED(CONFIG_NUMA)) { unsigned int nr, *counters = m->private; unsigned int step = 1U << vm_area_page_order(v); if (!counters) return; if (v->flags & VM_UNINITIALIZED) return; /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ smp_rmb(); memset(counters, 0, nr_node_ids * sizeof(unsigned int)); for (nr = 0; nr < v->nr_pages; nr += step) counters[page_to_nid(v->pages[nr])] += step; for_each_node_state(nr, N_HIGH_MEMORY) if (counters[nr]) seq_printf(m, " N%u=%u", nr, counters[nr]); } } static void show_purge_info(struct seq_file *m) { struct vmap_node *vn; struct vmap_area *va; int i; for (i = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; spin_lock(&vn->lazy.lock); list_for_each_entry(va, &vn->lazy.head, list) { seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", (void *)va->va_start, (void *)va->va_end, va_size(va)); } spin_unlock(&vn->lazy.lock); } } static int vmalloc_info_show(struct seq_file *m, void *p) { struct vmap_node *vn; struct vmap_area *va; struct vm_struct *v; int i; for (i = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; spin_lock(&vn->busy.lock); list_for_each_entry(va, &vn->busy.head, list) { if (!va->vm) { if (va->flags & VMAP_RAM) seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", (void *)va->va_start, (void *)va->va_end, va_size(va)); continue; } v = va->vm; seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); if (v->caller) seq_printf(m, " %pS", v->caller); if (v->nr_pages) seq_printf(m, " pages=%d", v->nr_pages); if (v->phys_addr) seq_printf(m, " phys=%pa", &v->phys_addr); if (v->flags & VM_IOREMAP) seq_puts(m, " ioremap"); if (v->flags & VM_SPARSE) seq_puts(m, " sparse"); if (v->flags & VM_ALLOC) seq_puts(m, " vmalloc"); if (v->flags & VM_MAP) seq_puts(m, " vmap"); if (v->flags & VM_USERMAP) seq_puts(m, " user"); if (v->flags & VM_DMA_COHERENT) seq_puts(m, " dma-coherent"); if (is_vmalloc_addr(v->pages)) seq_puts(m, " vpages"); show_numa_info(m, v); seq_putc(m, '\n'); } spin_unlock(&vn->busy.lock); } /* * As a final step, dump "unpurged" areas. */ show_purge_info(m); return 0; } static int __init proc_vmalloc_init(void) { void *priv_data = NULL; if (IS_ENABLED(CONFIG_NUMA)) priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); proc_create_single_data("vmallocinfo", 0400, NULL, vmalloc_info_show, priv_data); return 0; } module_init(proc_vmalloc_init); #endif static void __init vmap_init_free_space(void) { unsigned long vmap_start = 1; const unsigned long vmap_end = ULONG_MAX; struct vmap_area *free; struct vm_struct *busy; /* * B F B B B F * -|-----|.....|-----|-----|-----|.....|- * | The KVA space | * |<--------------------------------->| */ for (busy = vmlist; busy; busy = busy->next) { if ((unsigned long) busy->addr - vmap_start > 0) { free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (!WARN_ON_ONCE(!free)) { free->va_start = vmap_start; free->va_end = (unsigned long) busy->addr; insert_vmap_area_augment(free, NULL, &free_vmap_area_root, &free_vmap_area_list); } } vmap_start = (unsigned long) busy->addr + busy->size; } if (vmap_end - vmap_start > 0) { free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (!WARN_ON_ONCE(!free)) { free->va_start = vmap_start; free->va_end = vmap_end; insert_vmap_area_augment(free, NULL, &free_vmap_area_root, &free_vmap_area_list); } } } static void vmap_init_nodes(void) { struct vmap_node *vn; int i, n; #if BITS_PER_LONG == 64 /* * A high threshold of max nodes is fixed and bound to 128, * thus a scale factor is 1 for systems where number of cores * are less or equal to specified threshold. * * As for NUMA-aware notes. For bigger systems, for example * NUMA with multi-sockets, where we can end-up with thousands * of cores in total, a "sub-numa-clustering" should be added. * * In this case a NUMA domain is considered as a single entity * with dedicated sub-nodes in it which describe one group or * set of cores. Therefore a per-domain purging is supposed to * be added as well as a per-domain balancing. */ n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); if (n > 1) { vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN); if (vn) { /* Node partition is 16 pages. */ vmap_zone_size = (1 << 4) * PAGE_SIZE; nr_vmap_nodes = n; vmap_nodes = vn; } else { pr_err("Failed to allocate an array. Disable a node layer\n"); } } #endif for (n = 0; n < nr_vmap_nodes; n++) { vn = &vmap_nodes[n]; vn->busy.root = RB_ROOT; INIT_LIST_HEAD(&vn->busy.head); spin_lock_init(&vn->busy.lock); vn->lazy.root = RB_ROOT; INIT_LIST_HEAD(&vn->lazy.head); spin_lock_init(&vn->lazy.lock); for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { INIT_LIST_HEAD(&vn->pool[i].head); WRITE_ONCE(vn->pool[i].len, 0); } spin_lock_init(&vn->pool_lock); } } static unsigned long vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { unsigned long count; struct vmap_node *vn; int i, j; for (count = 0, i = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; for (j = 0; j < MAX_VA_SIZE_PAGES; j++) count += READ_ONCE(vn->pool[j].len); } return count ? count : SHRINK_EMPTY; } static unsigned long vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { int i; for (i = 0; i < nr_vmap_nodes; i++) decay_va_pool_node(&vmap_nodes[i], true); return SHRINK_STOP; } void __init vmalloc_init(void) { struct shrinker *vmap_node_shrinker; struct vmap_area *va; struct vmap_node *vn; struct vm_struct *tmp; int i; /* * Create the cache for vmap_area objects. */ vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); for_each_possible_cpu(i) { struct vmap_block_queue *vbq; struct vfree_deferred *p; vbq = &per_cpu(vmap_block_queue, i); spin_lock_init(&vbq->lock); INIT_LIST_HEAD(&vbq->free); p = &per_cpu(vfree_deferred, i); init_llist_head(&p->list); INIT_WORK(&p->wq, delayed_vfree_work); xa_init(&vbq->vmap_blocks); } /* * Setup nodes before importing vmlist. */ vmap_init_nodes(); /* Import existing vmlist entries. */ for (tmp = vmlist; tmp; tmp = tmp->next) { va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (WARN_ON_ONCE(!va)) continue; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; va->vm = tmp; vn = addr_to_node(va->va_start); insert_vmap_area(va, &vn->busy.root, &vn->busy.head); } /* * Now we can initialize a free vmap space. */ vmap_init_free_space(); vmap_initialized = true; vmap_node_shrinker = shrinker_alloc(0, "vmap-node"); if (!vmap_node_shrinker) { pr_err("Failed to allocate vmap-node shrinker!\n"); return; } vmap_node_shrinker->count_objects = vmap_node_shrink_count; vmap_node_shrinker->scan_objects = vmap_node_shrink_scan; shrinker_register(vmap_node_shrinker); }
8 8 8 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "queueing.h" #include <linux/skb_array.h> struct multicore_worker __percpu * wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr) { int cpu; struct multicore_worker __percpu *worker = alloc_percpu(struct multicore_worker); if (!worker) return NULL; for_each_possible_cpu(cpu) { per_cpu_ptr(worker, cpu)->ptr = ptr; INIT_WORK(&per_cpu_ptr(worker, cpu)->work, function); } return worker; } int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, unsigned int len) { int ret; memset(queue, 0, sizeof(*queue)); queue->last_cpu = -1; ret = ptr_ring_init(&queue->ring, len, GFP_KERNEL); if (ret) return ret; queue->worker = wg_packet_percpu_multicore_worker_alloc(function, queue); if (!queue->worker) { ptr_ring_cleanup(&queue->ring, NULL); return -ENOMEM; } return 0; } void wg_packet_queue_free(struct crypt_queue *queue, bool purge) { free_percpu(queue->worker); WARN_ON(!purge && !__ptr_ring_empty(&queue->ring)); ptr_ring_cleanup(&queue->ring, purge ? __skb_array_destroy_skb : NULL); } #define NEXT(skb) ((skb)->prev) #define STUB(queue) ((struct sk_buff *)&queue->empty) void wg_prev_queue_init(struct prev_queue *queue) { NEXT(STUB(queue)) = NULL; queue->head = queue->tail = STUB(queue); queue->peeked = NULL; atomic_set(&queue->count, 0); BUILD_BUG_ON( offsetof(struct sk_buff, next) != offsetof(struct prev_queue, empty.next) - offsetof(struct prev_queue, empty) || offsetof(struct sk_buff, prev) != offsetof(struct prev_queue, empty.prev) - offsetof(struct prev_queue, empty)); } static void __wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb) { WRITE_ONCE(NEXT(skb), NULL); WRITE_ONCE(NEXT(xchg_release(&queue->head, skb)), skb); } bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb) { if (!atomic_add_unless(&queue->count, 1, MAX_QUEUED_PACKETS)) return false; __wg_prev_queue_enqueue(queue, skb); return true; } struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue) { struct sk_buff *tail = queue->tail, *next = smp_load_acquire(&NEXT(tail)); if (tail == STUB(queue)) { if (!next) return NULL; queue->tail = next; tail = next; next = smp_load_acquire(&NEXT(next)); } if (next) { queue->tail = next; atomic_dec(&queue->count); return tail; } if (tail != READ_ONCE(queue->head)) return NULL; __wg_prev_queue_enqueue(queue, STUB(queue)); next = smp_load_acquire(&NEXT(tail)); if (next) { queue->tail = next; atomic_dec(&queue->count); return tail; } return NULL; } #undef NEXT #undef STUB
46 46 46 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 // SPDX-License-Identifier: GPL-2.0-or-later /* client.c: NFS client sharing and management code * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/time.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/errno.h> #include <linux/unistd.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/metrics.h> #include <linux/sunrpc/xprtsock.h> #include <linux/sunrpc/xprtrdma.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <linux/lockd/bind.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> #include <linux/slab.h> #include <linux/idr.h> #include <net/ipv6.h> #include <linux/nfs_xdr.h> #include <linux/sunrpc/bc_xprt.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" #include "iostat.h" #include "internal.h" #include "fscache.h" #include "pnfs.h" #include "nfs.h" #include "netns.h" #include "sysfs.h" #include "nfs42.h" #define NFSDBG_FACILITY NFSDBG_CLIENT static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); static DEFINE_RWLOCK(nfs_version_lock); static struct nfs_subversion *nfs_version_mods[5] = { [2] = NULL, [3] = NULL, [4] = NULL, }; /* * RPC cruft for NFS */ static const struct rpc_version *nfs_version[5] = { [2] = NULL, [3] = NULL, [4] = NULL, }; const struct rpc_program nfs_program = { .name = "nfs", .number = NFS_PROGRAM, .nrvers = ARRAY_SIZE(nfs_version), .version = nfs_version, .pipe_dir_name = NFS_PIPE_DIRNAME, }; static struct nfs_subversion *__find_nfs_version(unsigned int version) { struct nfs_subversion *nfs; read_lock(&nfs_version_lock); nfs = nfs_version_mods[version]; read_unlock(&nfs_version_lock); return nfs; } struct nfs_subversion *find_nfs_version(unsigned int version) { struct nfs_subversion *nfs = __find_nfs_version(version); if (!nfs && request_module("nfsv%d", version) == 0) nfs = __find_nfs_version(version); if (!nfs) return ERR_PTR(-EPROTONOSUPPORT); if (!get_nfs_version(nfs)) return ERR_PTR(-EAGAIN); return nfs; } int get_nfs_version(struct nfs_subversion *nfs) { return try_module_get(nfs->owner); } EXPORT_SYMBOL_GPL(get_nfs_version); void put_nfs_version(struct nfs_subversion *nfs) { module_put(nfs->owner); } void register_nfs_version(struct nfs_subversion *nfs) { write_lock(&nfs_version_lock); nfs_version_mods[nfs->rpc_ops->version] = nfs; nfs_version[nfs->rpc_ops->version] = nfs->rpc_vers; write_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(register_nfs_version); void unregister_nfs_version(struct nfs_subversion *nfs) { write_lock(&nfs_version_lock); nfs_version[nfs->rpc_ops->version] = NULL; nfs_version_mods[nfs->rpc_ops->version] = NULL; write_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(unregister_nfs_version); /* * Allocate a shared client record * * Since these are allocated/deallocated very rarely, we don't * bother putting them in a slab cache... */ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) { struct nfs_client *clp; int err = -ENOMEM; if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) goto error_0; clp->cl_minorversion = cl_init->minorversion; clp->cl_nfs_mod = cl_init->nfs_mod; if (!get_nfs_version(clp->cl_nfs_mod)) goto error_dealloc; clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; refcount_set(&clp->cl_count, 1); clp->cl_cons_state = NFS_CS_INITING; memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen); clp->cl_addrlen = cl_init->addrlen; if (cl_init->hostname) { err = -ENOMEM; clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL); if (!clp->cl_hostname) goto error_cleanup; } INIT_LIST_HEAD(&clp->cl_superblocks); clp->cl_rpcclient = ERR_PTR(-EINVAL); clp->cl_flags = cl_init->init_flags; clp->cl_proto = cl_init->proto; clp->cl_nconnect = cl_init->nconnect; clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1; clp->cl_net = get_net(cl_init->net); #if IS_ENABLED(CONFIG_NFS_LOCALIO) seqlock_init(&clp->cl_boot_lock); ktime_get_real_ts64(&clp->cl_nfssvc_boot); nfs_uuid_init(&clp->cl_uuid); spin_lock_init(&clp->cl_localio_lock); #endif /* CONFIG_NFS_LOCALIO */ clp->cl_principal = "*"; clp->cl_xprtsec = cl_init->xprtsec; return clp; error_cleanup: put_nfs_version(clp->cl_nfs_mod); error_dealloc: kfree(clp); error_0: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(nfs_alloc_client); #if IS_ENABLED(CONFIG_NFS_V4) static void nfs_cleanup_cb_ident_idr(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); idr_destroy(&nn->cb_ident_idr); } /* nfs_client_lock held */ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) { struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); if (clp->cl_cb_ident) idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident); } static void pnfs_init_server(struct nfs_server *server) { rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); } #else static void nfs_cleanup_cb_ident_idr(struct net *net) { } static void nfs_cb_idr_remove_locked(struct nfs_client *clp) { } static void pnfs_init_server(struct nfs_server *server) { } #endif /* CONFIG_NFS_V4 */ /* * Destroy a shared client record */ void nfs_free_client(struct nfs_client *clp) { nfs_local_disable(clp); /* -EIO all pending I/O */ if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); put_net(clp->cl_net); put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp->cl_acceptor); kfree_rcu(clp, rcu); } EXPORT_SYMBOL_GPL(nfs_free_client); /* * Release a reference to a shared client record */ void nfs_put_client(struct nfs_client *clp) { struct nfs_net *nn; if (!clp) return; nn = net_generic(clp->cl_net, nfs_net_id); if (refcount_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { list_del(&clp->cl_share_link); nfs_cb_idr_remove_locked(clp); spin_unlock(&nn->nfs_client_lock); WARN_ON_ONCE(!list_empty(&clp->cl_superblocks)); clp->rpc_ops->free_client(clp); } } EXPORT_SYMBOL_GPL(nfs_put_client); /* * Find an nfs_client on the list that matches the initialisation data * that is supplied. */ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) { struct nfs_client *clp; const struct sockaddr *sap = (struct sockaddr *)data->addr; struct nfs_net *nn = net_generic(data->net, nfs_net_id); int error; again: list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) continue; /* If a client is still initializing then we need to wait */ if (clp->cl_cons_state > NFS_CS_READY) { refcount_inc(&clp->cl_count); spin_unlock(&nn->nfs_client_lock); error = nfs_wait_client_init_complete(clp); nfs_put_client(clp); spin_lock(&nn->nfs_client_lock); if (error < 0) return ERR_PTR(error); goto again; } /* Different NFS versions cannot share the same nfs_client */ if (clp->rpc_ops != data->nfs_mod->rpc_ops) continue; if (clp->cl_proto != data->proto) continue; /* Match nfsv4 minorversion */ if (clp->cl_minorversion != data->minorversion) continue; /* Match request for a dedicated DS */ if (test_bit(NFS_CS_DS, &data->init_flags) != test_bit(NFS_CS_DS, &clp->cl_flags)) continue; /* Match the full socket address */ if (!rpc_cmp_addr_port(sap, clap)) /* Match all xprt_switch full socket addresses */ if (IS_ERR(clp->cl_rpcclient) || !rpc_clnt_xprt_switch_has_addr(clp->cl_rpcclient, sap)) continue; /* Match the xprt security policy */ if (clp->cl_xprtsec.policy != data->xprtsec.policy) continue; refcount_inc(&clp->cl_count); return clp; } return NULL; } /* * Return true if @clp is done initializing, false if still working on it. * * Use nfs_client_init_status to check if it was successful. */ bool nfs_client_init_is_complete(const struct nfs_client *clp) { return clp->cl_cons_state <= NFS_CS_READY; } EXPORT_SYMBOL_GPL(nfs_client_init_is_complete); /* * Return 0 if @clp was successfully initialized, -errno otherwise. * * This must be called *after* nfs_client_init_is_complete() returns true, * otherwise it will pop WARN_ON_ONCE and return -EINVAL */ int nfs_client_init_status(const struct nfs_client *clp) { /* called without checking nfs_client_init_is_complete */ if (clp->cl_cons_state > NFS_CS_READY) { WARN_ON_ONCE(1); return -EINVAL; } return clp->cl_cons_state; } EXPORT_SYMBOL_GPL(nfs_client_init_status); int nfs_wait_client_init_complete(const struct nfs_client *clp) { return wait_event_killable(nfs_client_active_wq, nfs_client_init_is_complete(clp)); } EXPORT_SYMBOL_GPL(nfs_wait_client_init_complete); /* * Found an existing client. Make sure it's ready before returning. */ static struct nfs_client * nfs_found_client(const struct nfs_client_initdata *cl_init, struct nfs_client *clp) { int error; error = nfs_wait_client_init_complete(clp); if (error < 0) { nfs_put_client(clp); return ERR_PTR(-ERESTARTSYS); } if (clp->cl_cons_state < NFS_CS_READY) { error = clp->cl_cons_state; nfs_put_client(clp); return ERR_PTR(error); } smp_rmb(); return clp; } /* * Look up a client by IP address and protocol version * - creates a new record if one doesn't yet exist */ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) { struct nfs_client *clp, *new = NULL; struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id); const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops; if (cl_init->hostname == NULL) { WARN_ON(1); return ERR_PTR(-EINVAL); } /* see if the client already exists */ do { spin_lock(&nn->nfs_client_lock); clp = nfs_match_client(cl_init); if (clp) { spin_unlock(&nn->nfs_client_lock); if (new) new->rpc_ops->free_client(new); if (IS_ERR(clp)) return clp; return nfs_found_client(cl_init, clp); } if (new) { list_add_tail(&new->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); new = rpc_ops->init_client(new, cl_init); if (!IS_ERR(new)) nfs_local_probe(new); return new; } spin_unlock(&nn->nfs_client_lock); new = rpc_ops->alloc_client(cl_init); } while (!IS_ERR(new)); return new; } EXPORT_SYMBOL_GPL(nfs_get_client); /* * Mark a server as ready or failed */ void nfs_mark_client_ready(struct nfs_client *clp, int state) { smp_wmb(); clp->cl_cons_state = state; wake_up_all(&nfs_client_active_wq); } EXPORT_SYMBOL_GPL(nfs_mark_client_ready); /* * Initialise the timeout values for a connection */ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans) { to->to_initval = timeo * HZ / 10; to->to_retries = retrans; switch (proto) { case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_TCP_TLS: case XPRT_TRANSPORT_RDMA: if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_TCP_RETRANS; if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; to->to_increment = to->to_initval; to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); if (to->to_maxval > NFS_MAX_TCP_TIMEOUT) to->to_maxval = NFS_MAX_TCP_TIMEOUT; if (to->to_maxval < to->to_initval) to->to_maxval = to->to_initval; to->to_exponential = 0; break; case XPRT_TRANSPORT_UDP: if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_UDP_RETRANS; if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_UDP_TIMEOUT) to->to_initval = NFS_MAX_UDP_TIMEOUT; to->to_maxval = NFS_MAX_UDP_TIMEOUT; to->to_exponential = 1; break; default: BUG(); } } EXPORT_SYMBOL_GPL(nfs_init_timeout_values); /* * Create an RPC client handle */ int nfs_create_rpc_client(struct nfs_client *clp, const struct nfs_client_initdata *cl_init, rpc_authflavor_t flavor) { struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); struct rpc_clnt *clnt = NULL; struct rpc_create_args args = { .net = clp->cl_net, .protocol = clp->cl_proto, .nconnect = clp->cl_nconnect, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = clp->cl_addrlen, .timeout = cl_init->timeparms, .servername = clp->cl_hostname, .nodename = cl_init->nodename, .program = &nfs_program, .stats = &nn->rpcstats, .version = clp->rpc_ops->version, .authflavor = flavor, .cred = cl_init->cred, .xprtsec = cl_init->xprtsec, .connect_timeout = cl_init->connect_timeout, .reconnect_timeout = cl_init->reconnect_timeout, }; if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_DISCRTRY; if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT; if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS; if (test_bit(NFS_CS_NOPING, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NOPING; if (test_bit(NFS_CS_REUSEPORT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_REUSEPORT; if (!IS_ERR(clp->cl_rpcclient)) return 0; clnt = rpc_create(&args); if (IS_ERR(clnt)) { dprintk("%s: cannot create RPC client. Error = %ld\n", __func__, PTR_ERR(clnt)); return PTR_ERR(clnt); } clnt->cl_principal = clp->cl_principal; clp->cl_rpcclient = clnt; clnt->cl_max_connect = clp->cl_max_connect; return 0; } EXPORT_SYMBOL_GPL(nfs_create_rpc_client); /* * Version 2 or 3 client destruction */ static void nfs_destroy_server(struct nfs_server *server) { if (server->nlm_host) nlmclnt_done(server->nlm_host); } /* * Version 2 or 3 lockd setup */ static int nfs_start_lockd(struct nfs_server *server) { struct nlm_host *host; struct nfs_client *clp = server->nfs_client; struct nlmclnt_initdata nlm_init = { .hostname = clp->cl_hostname, .address = (struct sockaddr *)&clp->cl_addr, .addrlen = clp->cl_addrlen, .nfs_version = clp->rpc_ops->version, .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? 1 : 0, .net = clp->cl_net, .nlmclnt_ops = clp->cl_nfs_mod->rpc_ops->nlmclnt_ops, .cred = server->cred, }; if (nlm_init.nfs_version > 3) return 0; if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) && (server->flags & NFS_MOUNT_LOCAL_FCNTL)) return 0; switch (clp->cl_proto) { default: nlm_init.protocol = IPPROTO_TCP; break; #ifndef CONFIG_NFS_DISABLE_UDP_SUPPORT case XPRT_TRANSPORT_UDP: nlm_init.protocol = IPPROTO_UDP; #endif } host = nlmclnt_init(&nlm_init); if (IS_ERR(host)) return PTR_ERR(host); server->nlm_host = host; server->destroy = nfs_destroy_server; nfs_sysfs_link_rpc_client(server, nlmclnt_rpc_clnt(host), NULL); return 0; } /* * Create a general RPC client */ int nfs_init_server_rpcclient(struct nfs_server *server, const struct rpc_timeout *timeo, rpc_authflavor_t pseudoflavour) { struct nfs_client *clp = server->nfs_client; server->client = rpc_clone_client_set_auth(clp->cl_rpcclient, pseudoflavour); if (IS_ERR(server->client)) { dprintk("%s: couldn't create rpc_client!\n", __func__); return PTR_ERR(server->client); } memcpy(&server->client->cl_timeout_default, timeo, sizeof(server->client->cl_timeout_default)); server->client->cl_timeout = &server->client->cl_timeout_default; server->client->cl_softrtry = 0; if (server->flags & NFS_MOUNT_SOFTERR) server->client->cl_softerr = 1; if (server->flags & NFS_MOUNT_SOFT) server->client->cl_softrtry = 1; nfs_sysfs_link_rpc_client(server, server->client, NULL); return 0; } EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient); /** * nfs_init_client - Initialise an NFS2 or NFS3 client * * @clp: nfs_client to initialise * @cl_init: Initialisation parameters * * Returns pointer to an NFS client, or an ERR_PTR value. */ struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct nfs_client_initdata *cl_init) { int error; /* the client is already initialised */ if (clp->cl_cons_state == NFS_CS_READY) return clp; /* * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 */ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); nfs_mark_client_ready(clp, error == 0 ? NFS_CS_READY : error); if (error < 0) { nfs_put_client(clp); clp = ERR_PTR(error); } return clp; } EXPORT_SYMBOL_GPL(nfs_init_client); /* * Create a version 2 or 3 client */ static int nfs_init_server(struct nfs_server *server, const struct fs_context *fc) { const struct nfs_fs_context *ctx = nfs_fc2context(fc); struct rpc_timeout timeparms; struct nfs_client_initdata cl_init = { .hostname = ctx->nfs_server.hostname, .addr = &ctx->nfs_server._address, .addrlen = ctx->nfs_server.addrlen, .nfs_mod = ctx->nfs_mod, .proto = ctx->nfs_server.protocol, .net = fc->net_ns, .timeparms = &timeparms, .cred = server->cred, .nconnect = ctx->nfs_server.nconnect, .init_flags = (1UL << NFS_CS_REUSEPORT), .xprtsec = ctx->xprtsec, }; struct nfs_client *clp; int error; nfs_init_timeout_values(&timeparms, ctx->nfs_server.protocol, ctx->timeo, ctx->retrans); if (ctx->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); if (IS_ERR(clp)) return PTR_ERR(clp); server->nfs_client = clp; nfs_sysfs_add_server(server); nfs_sysfs_link_rpc_client(server, clp->cl_rpcclient, "_state"); /* Initialise the client representation from the mount data */ server->flags = ctx->flags; server->options = ctx->options; server->caps |= NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS; switch (clp->rpc_ops->version) { case 2: server->fattr_valid = NFS_ATTR_FATTR_V2; break; case 3: server->fattr_valid = NFS_ATTR_FATTR_V3; break; default: server->fattr_valid = NFS_ATTR_FATTR_V4; } if (ctx->rsize) server->rsize = nfs_io_size(ctx->rsize, clp->cl_proto); if (ctx->wsize) server->wsize = nfs_io_size(ctx->wsize, clp->cl_proto); server->acregmin = ctx->acregmin * HZ; server->acregmax = ctx->acregmax * HZ; server->acdirmin = ctx->acdirmin * HZ; server->acdirmax = ctx->acdirmax * HZ; /* Start lockd here, before we might error out */ error = nfs_start_lockd(server); if (error < 0) goto error; server->port = ctx->nfs_server.port; server->auth_info = ctx->auth_info; error = nfs_init_server_rpcclient(server, &timeparms, ctx->selected_flavor); if (error < 0) goto error; /* Preserve the values of mount_server-related mount options */ if (ctx->mount_server.addrlen) { memcpy(&server->mountd_address, &ctx->mount_server.address, ctx->mount_server.addrlen); server->mountd_addrlen = ctx->mount_server.addrlen; } server->mountd_version = ctx->mount_server.version; server->mountd_port = ctx->mount_server.port; server->mountd_protocol = ctx->mount_server.protocol; server->namelen = ctx->namlen; return 0; error: server->nfs_client = NULL; nfs_put_client(clp); return error; } /* * Load up the server record from information gained in an fsinfo record */ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) { struct nfs_client *clp = server->nfs_client; unsigned long max_rpc_payload, raw_max_rpc_payload; /* Work out a lot of parameters */ if (server->rsize == 0) server->rsize = nfs_io_size(fsinfo->rtpref, clp->cl_proto); if (server->wsize == 0) server->wsize = nfs_io_size(fsinfo->wtpref, clp->cl_proto); if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax) server->rsize = nfs_io_size(fsinfo->rtmax, clp->cl_proto); if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax) server->wsize = nfs_io_size(fsinfo->wtmax, clp->cl_proto); raw_max_rpc_payload = rpc_max_payload(server->client); max_rpc_payload = nfs_block_size(raw_max_rpc_payload, NULL); if (server->rsize > max_rpc_payload) server->rsize = max_rpc_payload; if (server->rsize > NFS_MAX_FILE_IO_SIZE) server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT; if (server->wsize > max_rpc_payload) server->wsize = max_rpc_payload; if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; server->wpages = (server->wsize + PAGE_SIZE - 1) >> PAGE_SHIFT; server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); if (server->dtsize > NFS_MAX_FILE_IO_SIZE) server->dtsize = NFS_MAX_FILE_IO_SIZE; if (server->dtsize > server->rsize) server->dtsize = server->rsize; if (server->flags & NFS_MOUNT_NOAC) { server->acregmin = server->acregmax = 0; server->acdirmin = server->acdirmax = 0; } server->maxfilesize = fsinfo->maxfilesize; server->time_delta = fsinfo->time_delta; server->change_attr_type = fsinfo->change_attr_type; server->clone_blksize = fsinfo->clone_blksize; /* We're airborne Set socket buffersize */ rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); #ifdef CONFIG_NFS_V4_2 /* * Defaults until limited by the session parameters. */ server->gxasize = min_t(unsigned int, raw_max_rpc_payload, XATTR_SIZE_MAX); server->sxasize = min_t(unsigned int, raw_max_rpc_payload, XATTR_SIZE_MAX); server->lxasize = min_t(unsigned int, raw_max_rpc_payload, nfs42_listxattr_xdrsize(XATTR_LIST_MAX)); if (fsinfo->xattr_support) server->caps |= NFS_CAP_XATTR; #endif } /* * Probe filesystem information, including the FSID on v2/v3 */ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr) { struct nfs_fsinfo fsinfo; struct nfs_client *clp = server->nfs_client; int error; if (clp->rpc_ops->set_capabilities != NULL) { error = clp->rpc_ops->set_capabilities(server, mntfh); if (error < 0) return error; } fsinfo.fattr = fattr; fsinfo.nlayouttypes = 0; memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype)); error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) return error; nfs_server_set_fsinfo(server, &fsinfo); /* Get some general file system info */ if (server->namelen == 0) { struct nfs_pathconf pathinfo; pathinfo.fattr = fattr; nfs_fattr_init(fattr); if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0) server->namelen = pathinfo.max_namelen; } if (clp->rpc_ops->discover_trunking != NULL && (server->caps & NFS_CAP_FS_LOCATIONS && (server->flags & NFS_MOUNT_TRUNK_DISCOVERY))) { error = clp->rpc_ops->discover_trunking(server, mntfh); if (error < 0) return error; } return 0; } /* * Grab the destination's particulars, including lease expiry time. * * Returns zero if probe succeeded and retrieved FSID matches the FSID * we have cached. */ int nfs_probe_server(struct nfs_server *server, struct nfs_fh *mntfh) { struct nfs_fattr *fattr; int error; fattr = nfs_alloc_fattr(); if (fattr == NULL) return -ENOMEM; /* Sanity: the probe won't work if the destination server * does not recognize the migrated FH. */ error = nfs_probe_fsinfo(server, mntfh, fattr); nfs_free_fattr(fattr); return error; } EXPORT_SYMBOL_GPL(nfs_probe_server); /* * Copy useful information when duplicating a server record */ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) { target->flags = source->flags; target->rsize = source->rsize; target->wsize = source->wsize; target->acregmin = source->acregmin; target->acregmax = source->acregmax; target->acdirmin = source->acdirmin; target->acdirmax = source->acdirmax; target->caps = source->caps; target->options = source->options; target->auth_info = source->auth_info; target->port = source->port; } EXPORT_SYMBOL_GPL(nfs_server_copy_userdata); void nfs_server_insert_lists(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); spin_lock(&nn->nfs_client_lock); list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); list_add_tail(&server->master_link, &nn->nfs_volume_list); clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); spin_unlock(&nn->nfs_client_lock); } EXPORT_SYMBOL_GPL(nfs_server_insert_lists); void nfs_server_remove_lists(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; struct nfs_net *nn; if (clp == NULL) return; nn = net_generic(clp->cl_net, nfs_net_id); spin_lock(&nn->nfs_client_lock); list_del_rcu(&server->client_link); if (list_empty(&clp->cl_superblocks)) set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); list_del(&server->master_link); spin_unlock(&nn->nfs_client_lock); synchronize_rcu(); } EXPORT_SYMBOL_GPL(nfs_server_remove_lists); static DEFINE_IDA(s_sysfs_ids); /* * Allocate and initialise a server record */ struct nfs_server *nfs_alloc_server(void) { struct nfs_server *server; server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); if (!server) return NULL; server->s_sysfs_id = ida_alloc(&s_sysfs_ids, GFP_KERNEL); if (server->s_sysfs_id < 0) { kfree(server); return NULL; } server->client = server->client_acl = ERR_PTR(-EINVAL); /* Zero out the NFS state stuff */ INIT_LIST_HEAD(&server->client_link); INIT_LIST_HEAD(&server->master_link); INIT_LIST_HEAD(&server->delegations); INIT_LIST_HEAD(&server->layouts); INIT_LIST_HEAD(&server->state_owners_lru); INIT_LIST_HEAD(&server->ss_copies); INIT_LIST_HEAD(&server->ss_src_copies); atomic_set(&server->active, 0); server->io_stats = nfs_alloc_iostats(); if (!server->io_stats) { kfree(server); return NULL; } server->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; init_waitqueue_head(&server->write_congestion_wait); atomic_long_set(&server->writeback, 0); atomic64_set(&server->owner_ctr, 0); pnfs_init_server(server); rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); return server; } EXPORT_SYMBOL_GPL(nfs_alloc_server); static void delayed_free(struct rcu_head *p) { struct nfs_server *server = container_of(p, struct nfs_server, rcu); nfs_free_iostats(server->io_stats); kfree(server); } /* * Free up a server record */ void nfs_free_server(struct nfs_server *server) { nfs_server_remove_lists(server); if (server->destroy != NULL) server->destroy(server); if (!IS_ERR(server->client_acl)) rpc_shutdown_client(server->client_acl); if (!IS_ERR(server->client)) rpc_shutdown_client(server->client); nfs_put_client(server->nfs_client); if (server->kobj.state_initialized) { nfs_sysfs_remove_server(server); kobject_put(&server->kobj); } ida_free(&s_sysfs_ids, server->s_sysfs_id); put_cred(server->cred); nfs_release_automount_timer(); call_rcu(&server->rcu, delayed_free); } EXPORT_SYMBOL_GPL(nfs_free_server); /* * Create a version 2 or 3 volume record * - keyed on server and FSID */ struct nfs_server *nfs_create_server(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_server *server; struct nfs_fattr *fattr; int error; server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); server->cred = get_cred(fc->cred); error = -ENOMEM; fattr = nfs_alloc_fattr(); if (fattr == NULL) goto error; /* Get a client representation */ error = nfs_init_server(server, fc); if (error < 0) goto error; /* Probe the root fh to retrieve its FSID */ error = nfs_probe_fsinfo(server, ctx->mntfh, fattr); if (error < 0) goto error; if (server->nfs_client->rpc_ops->version == 3) { if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) server->namelen = NFS3_MAXNAMLEN; if (!(ctx->flags & NFS_MOUNT_NORDIRPLUS)) server->caps |= NFS_CAP_READDIRPLUS; } else { if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) server->namelen = NFS2_MAXNAMLEN; } if (!(fattr->valid & NFS_ATTR_FATTR)) { error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh, fattr, NULL); if (error < 0) { dprintk("nfs_create_server: getattr error = %d\n", -error); goto error; } } memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); dprintk("Server FSID: %llx:%llx\n", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); nfs_server_insert_lists(server); server->mount_time = jiffies; nfs_free_fattr(fattr); return server; error: nfs_free_fattr(fattr); nfs_free_server(server); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_create_server); /* * Clone an NFS2, NFS3 or NFS4 server record */ struct nfs_server *nfs_clone_server(struct nfs_server *source, struct nfs_fh *fh, struct nfs_fattr *fattr, rpc_authflavor_t flavor) { struct nfs_server *server; int error; server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); server->cred = get_cred(source->cred); /* Copy data from the source */ server->nfs_client = source->nfs_client; server->destroy = source->destroy; refcount_inc(&server->nfs_client->cl_count); nfs_server_copy_userdata(server, source); server->fsid = fattr->fsid; nfs_sysfs_add_server(server); nfs_sysfs_link_rpc_client(server, server->nfs_client->cl_rpcclient, "_state"); error = nfs_init_server_rpcclient(server, source->client->cl_timeout, flavor); if (error < 0) goto out_free_server; /* probe the filesystem info for this server filesystem */ error = nfs_probe_server(server, fh); if (error < 0) goto out_free_server; if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; error = nfs_start_lockd(server); if (error < 0) goto out_free_server; nfs_server_insert_lists(server); server->mount_time = jiffies; return server; out_free_server: nfs_free_server(server); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_clone_server); void nfs_clients_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); INIT_LIST_HEAD(&nn->nfs_client_list); INIT_LIST_HEAD(&nn->nfs_volume_list); #if IS_ENABLED(CONFIG_NFS_V4) idr_init(&nn->cb_ident_idr); #endif spin_lock_init(&nn->nfs_client_lock); nn->boot_time = ktime_get_real(); memset(&nn->rpcstats, 0, sizeof(nn->rpcstats)); nn->rpcstats.program = &nfs_program; nfs_netns_sysfs_setup(nn, net); } void nfs_clients_exit(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); nfs_netns_sysfs_destroy(nn); nfs_cleanup_cb_ident_idr(net); WARN_ON_ONCE(!list_empty(&nn->nfs_client_list)); WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list)); } #ifdef CONFIG_PROC_FS static void *nfs_server_list_start(struct seq_file *p, loff_t *pos); static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_server_list_stop(struct seq_file *p, void *v); static int nfs_server_list_show(struct seq_file *m, void *v); static const struct seq_operations nfs_server_list_ops = { .start = nfs_server_list_start, .next = nfs_server_list_next, .stop = nfs_server_list_stop, .show = nfs_server_list_show, }; static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos); static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_volume_list_stop(struct seq_file *p, void *v); static int nfs_volume_list_show(struct seq_file *m, void *v); static const struct seq_operations nfs_volume_list_ops = { .start = nfs_volume_list_start, .next = nfs_volume_list_next, .stop = nfs_volume_list_stop, .show = nfs_volume_list_show, }; /* * set up the iterator to start reading from the server list and return the first item */ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) __acquires(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* lock the list against modification */ spin_lock(&nn->nfs_client_lock); return seq_list_start_head(&nn->nfs_client_list, *_pos); } /* * move to next server */ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); return seq_list_next(v, &nn->nfs_client_list, pos); } /* * clean up after reading from the transports list */ static void nfs_server_list_stop(struct seq_file *p, void *v) __releases(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); spin_unlock(&nn->nfs_client_lock); } /* * display a header line followed by a load of call lines */ static int nfs_server_list_show(struct seq_file *m, void *v) { struct nfs_client *clp; struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_client_list) { seq_puts(m, "NV SERVER PORT USE HOSTNAME\n"); return 0; } /* display one transport per line on subsequent lines */ clp = list_entry(v, struct nfs_client, cl_share_link); /* Check if the client is initialized */ if (clp->cl_cons_state != NFS_CS_READY) return 0; rcu_read_lock(); seq_printf(m, "v%u %s %s %3d %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), refcount_read(&clp->cl_count), clp->cl_hostname); rcu_read_unlock(); return 0; } /* * set up the iterator to start reading from the volume list and return the first item */ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) __acquires(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* lock the list against modification */ spin_lock(&nn->nfs_client_lock); return seq_list_start_head(&nn->nfs_volume_list, *_pos); } /* * move to next volume */ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); return seq_list_next(v, &nn->nfs_volume_list, pos); } /* * clean up after reading from the transports list */ static void nfs_volume_list_stop(struct seq_file *p, void *v) __releases(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); spin_unlock(&nn->nfs_client_lock); } /* * display a header line followed by a load of call lines */ static int nfs_volume_list_show(struct seq_file *m, void *v) { struct nfs_server *server; struct nfs_client *clp; char dev[13]; // 8 for 2^24, 1 for ':', 3 for 2^8, 1 for '\0' char fsid[34]; // 2 * 16 for %llx, 1 for ':', 1 for '\0' struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_volume_list) { seq_puts(m, "NV SERVER PORT DEV FSID" " FSC\n"); return 0; } /* display one transport per line on subsequent lines */ server = list_entry(v, struct nfs_server, master_link); clp = server->nfs_client; snprintf(dev, sizeof(dev), "%u:%u", MAJOR(server->s_dev), MINOR(server->s_dev)); snprintf(fsid, sizeof(fsid), "%llx:%llx", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); rcu_read_lock(); seq_printf(m, "v%u %s %s %-12s %-33s %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), dev, fsid, nfs_server_fscache_state(server)); rcu_read_unlock(); return 0; } int nfs_fs_proc_net_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); struct proc_dir_entry *p; nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net); if (!nn->proc_nfsfs) goto error_0; /* a file of servers with which we're dealing */ p = proc_create_net("servers", S_IFREG|S_IRUGO, nn->proc_nfsfs, &nfs_server_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; /* a file of volumes that we have mounted */ p = proc_create_net("volumes", S_IFREG|S_IRUGO, nn->proc_nfsfs, &nfs_volume_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; return 0; error_1: remove_proc_subtree("nfsfs", net->proc_net); error_0: return -ENOMEM; } void nfs_fs_proc_net_exit(struct net *net) { remove_proc_subtree("nfsfs", net->proc_net); } /* * initialise the /proc/fs/nfsfs/ directory */ int __init nfs_fs_proc_init(void) { if (!proc_mkdir("fs/nfsfs", NULL)) goto error_0; /* a file of servers with which we're dealing */ if (!proc_symlink("fs/nfsfs/servers", NULL, "../../net/nfsfs/servers")) goto error_1; /* a file of volumes that we have mounted */ if (!proc_symlink("fs/nfsfs/volumes", NULL, "../../net/nfsfs/volumes")) goto error_1; return 0; error_1: remove_proc_subtree("fs/nfsfs", NULL); error_0: return -ENOMEM; } /* * clean up the /proc/fs/nfsfs/ directory */ void nfs_fs_proc_exit(void) { remove_proc_subtree("fs/nfsfs", NULL); ida_destroy(&s_sysfs_ids); } #endif /* CONFIG_PROC_FS */
46 45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 // SPDX-License-Identifier: GPL-2.0-only /* * Syscall interface to knfsd. * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ #include <linux/slab.h> #include <linux/namei.h> #include <linux/ctype.h> #include <linux/fs_context.h> #include <linux/sunrpc/svcsock.h> #include <linux/lockd/lockd.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/gss_api.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/svc.h> #include <linux/module.h> #include <linux/fsnotify.h> #include <linux/nfslocalio.h> #include "idmap.h" #include "nfsd.h" #include "cache.h" #include "state.h" #include "netns.h" #include "pnfs.h" #include "filecache.h" #include "trace.h" #include "netlink.h" /* * We have a single directory with several nodes in it. */ enum { NFSD_Root = 1, NFSD_List, NFSD_Export_Stats, NFSD_Export_features, NFSD_Fh, NFSD_FO_UnlockIP, NFSD_FO_UnlockFS, NFSD_Threads, NFSD_Pool_Threads, NFSD_Pool_Stats, NFSD_Reply_Cache_Stats, NFSD_Versions, NFSD_Ports, NFSD_MaxBlkSize, NFSD_MaxConnections, NFSD_Filecache, NFSD_Leasetime, NFSD_Gracetime, NFSD_RecoveryDir, NFSD_V4EndGrace, NFSD_MaxReserved }; /* * write() for these nodes. */ static ssize_t write_filehandle(struct file *file, char *buf, size_t size); static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size); static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size); static ssize_t write_threads(struct file *file, char *buf, size_t size); static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); static ssize_t write_versions(struct file *file, char *buf, size_t size); static ssize_t write_ports(struct file *file, char *buf, size_t size); static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); static ssize_t write_maxconn(struct file *file, char *buf, size_t size); #ifdef CONFIG_NFSD_V4 static ssize_t write_leasetime(struct file *file, char *buf, size_t size); static ssize_t write_gracetime(struct file *file, char *buf, size_t size); #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); #endif static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size); #endif static ssize_t (*const write_op[])(struct file *, char *, size_t) = { [NFSD_Fh] = write_filehandle, [NFSD_FO_UnlockIP] = write_unlock_ip, [NFSD_FO_UnlockFS] = write_unlock_fs, [NFSD_Threads] = write_threads, [NFSD_Pool_Threads] = write_pool_threads, [NFSD_Versions] = write_versions, [NFSD_Ports] = write_ports, [NFSD_MaxBlkSize] = write_maxblksize, [NFSD_MaxConnections] = write_maxconn, #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = write_leasetime, [NFSD_Gracetime] = write_gracetime, #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING [NFSD_RecoveryDir] = write_recoverydir, #endif [NFSD_V4EndGrace] = write_v4_end_grace, #endif }; static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) { ino_t ino = file_inode(file)->i_ino; char *data; ssize_t rv; if (ino >= ARRAY_SIZE(write_op) || !write_op[ino]) return -EINVAL; data = simple_transaction_get(file, buf, size); if (IS_ERR(data)) return PTR_ERR(data); rv = write_op[ino](file, data, size); if (rv < 0) return rv; simple_transaction_set(file, rv); return size; } static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) { if (! file->private_data) { /* An attempt to read a transaction file without writing * causes a 0-byte write so that the file can return * state information */ ssize_t rv = nfsctl_transaction_write(file, buf, 0, pos); if (rv < 0) return rv; } return simple_transaction_read(file, buf, size, pos); } static const struct file_operations transaction_ops = { .write = nfsctl_transaction_write, .read = nfsctl_transaction_read, .release = simple_transaction_release, .llseek = default_llseek, }; static int exports_net_open(struct net *net, struct file *file) { int err; struct seq_file *seq; struct nfsd_net *nn = net_generic(net, nfsd_net_id); err = seq_open(file, &nfs_exports_op); if (err) return err; seq = file->private_data; seq->private = nn->svc_export_cache; return 0; } static int exports_nfsd_open(struct inode *inode, struct file *file) { return exports_net_open(inode->i_sb->s_fs_info, file); } static const struct file_operations exports_nfsd_operations = { .open = exports_nfsd_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static int export_features_show(struct seq_file *m, void *v) { seq_printf(m, "0x%x 0x%x\n", NFSEXP_ALLFLAGS, NFSEXP_SECINFO_FLAGS); return 0; } DEFINE_SHOW_ATTRIBUTE(export_features); static int nfsd_pool_stats_open(struct inode *inode, struct file *file) { struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id); return svc_pool_stats_open(&nn->nfsd_info, file); } static const struct file_operations pool_stats_operations = { .open = nfsd_pool_stats_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; DEFINE_SHOW_ATTRIBUTE(nfsd_reply_cache_stats); DEFINE_SHOW_ATTRIBUTE(nfsd_file_cache_stats); /*----------------------------------------------------------------------------*/ /* * payload - write methods */ static inline struct net *netns(struct file *file) { return file_inode(file)->i_sb->s_fs_info; } /* * write_unlock_ip - Release all locks used by a client * * Experimental. * * Input: * buf: '\n'-terminated C string containing a * presentation format IP address * size: length of C string in @buf * Output: * On success: returns zero if all specified locks were released; * returns one if one or more locks were not released * On error: return code is negative errno value */ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) { struct sockaddr_storage address; struct sockaddr *sap = (struct sockaddr *)&address; size_t salen = sizeof(address); char *fo_path; struct net *net = netns(file); /* sanity check */ if (size == 0) return -EINVAL; if (buf[size-1] != '\n') return -EINVAL; fo_path = buf; if (qword_get(&buf, fo_path, size) < 0) return -EINVAL; if (rpc_pton(net, fo_path, size, sap, salen) == 0) return -EINVAL; trace_nfsd_ctl_unlock_ip(net, buf); return nlmsvc_unlock_all_by_ip(sap); } /* * write_unlock_fs - Release all locks on a local file system * * Experimental. * * Input: * buf: '\n'-terminated C string containing the * absolute pathname of a local file system * size: length of C string in @buf * Output: * On success: returns zero if all specified locks were released; * returns one if one or more locks were not released * On error: return code is negative errno value */ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) { struct path path; char *fo_path; int error; /* sanity check */ if (size == 0) return -EINVAL; if (buf[size-1] != '\n') return -EINVAL; fo_path = buf; if (qword_get(&buf, fo_path, size) < 0) return -EINVAL; trace_nfsd_ctl_unlock_fs(netns(file), fo_path); error = kern_path(fo_path, 0, &path); if (error) return error; /* * XXX: Needs better sanity checking. Otherwise we could end up * releasing locks on the wrong file system. * * For example: * 1. Does the path refer to a directory? * 2. Is that directory a mount point, or * 3. Is that directory the root of an exported file system? */ error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb); nfsd4_revoke_states(netns(file), path.dentry->d_sb); path_put(&path); return error; } /* * write_filehandle - Get a variable-length NFS file handle by path * * On input, the buffer contains a '\n'-terminated C string comprised of * three alphanumeric words separated by whitespace. The string may * contain escape sequences. * * Input: * buf: * domain: client domain name * path: export pathname * maxsize: numeric maximum size of * @buf * size: length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C * string containing a ASCII hex text version * of the NFS file handle; * return code is the size in bytes of the string * On error: return code is negative errno value */ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) { char *dname, *path; int maxsize; char *mesg = buf; int len; struct auth_domain *dom; struct knfsd_fh fh; if (size == 0) return -EINVAL; if (buf[size-1] != '\n') return -EINVAL; buf[size-1] = 0; dname = mesg; len = qword_get(&mesg, dname, size); if (len <= 0) return -EINVAL; path = dname+len+1; len = qword_get(&mesg, path, size); if (len <= 0) return -EINVAL; len = get_int(&mesg, &maxsize); if (len) return len; if (maxsize < NFS_FHSIZE) return -EINVAL; maxsize = min(maxsize, NFS3_FHSIZE); if (qword_get(&mesg, mesg, size) > 0) return -EINVAL; trace_nfsd_ctl_filehandle(netns(file), dname, path, maxsize); /* we have all the words, they are in buf.. */ dom = unix_domain_find(dname); if (!dom) return -ENOMEM; len = exp_rootfh(netns(file), dom, path, &fh, maxsize); auth_domain_put(dom); if (len) return len; mesg = buf; len = SIMPLE_TRANSACTION_LIMIT; qword_addhex(&mesg, &len, fh.fh_raw, fh.fh_size); mesg[-1] = '\n'; return mesg - buf; } /* * write_threads - Start NFSD, or report the current number of running threads * * Input: * buf: ignored * size: zero * Output: * On success: passed-in buffer filled with '\n'-terminated C * string numeric value representing the number of * running NFSD threads; * return code is the size in bytes of the string * On error: return code is zero * * OR * * Input: * buf: C string containing an unsigned * integer value representing the * number of NFSD threads to start * size: non-zero length of C string in @buf * Output: * On success: NFS service is started; * passed-in buffer filled with '\n'-terminated C * string numeric value representing the number of * running NFSD threads; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value */ static ssize_t write_threads(struct file *file, char *buf, size_t size) { char *mesg = buf; int rv; struct net *net = netns(file); if (size > 0) { int newthreads; rv = get_int(&mesg, &newthreads); if (rv) return rv; if (newthreads < 0) return -EINVAL; trace_nfsd_ctl_threads(net, newthreads); mutex_lock(&nfsd_mutex); rv = nfsd_svc(1, &newthreads, net, file->f_cred, NULL); mutex_unlock(&nfsd_mutex); if (rv < 0) return rv; } else rv = nfsd_nrthreads(net); return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv); } /* * write_pool_threads - Set or report the current number of threads per pool * * Input: * buf: ignored * size: zero * * OR * * Input: * buf: C string containing whitespace- * separated unsigned integer values * representing the number of NFSD * threads to start in each pool * size: non-zero length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C * string containing integer values representing the * number of NFSD threads in each pool; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value */ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) { /* if size > 0, look for an array of number of threads per node * and apply them then write out number of threads per node as reply */ char *mesg = buf; int i; int rv; int len; int npools; int *nthreads; struct net *net = netns(file); mutex_lock(&nfsd_mutex); npools = nfsd_nrpools(net); if (npools == 0) { /* * NFS is shut down. The admin can start it by * writing to the threads file but NOT the pool_threads * file, sorry. Report zero threads. */ mutex_unlock(&nfsd_mutex); strcpy(buf, "0\n"); return strlen(buf); } nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL); rv = -ENOMEM; if (nthreads == NULL) goto out_free; if (size > 0) { for (i = 0; i < npools; i++) { rv = get_int(&mesg, &nthreads[i]); if (rv == -ENOENT) break; /* fewer numbers than pools */ if (rv) goto out_free; /* syntax error */ rv = -EINVAL; if (nthreads[i] < 0) goto out_free; trace_nfsd_ctl_pool_threads(net, i, nthreads[i]); } /* * There must always be a thread in pool 0; the admin * can't shut down NFS completely using pool_threads. */ if (nthreads[0] == 0) nthreads[0] = 1; rv = nfsd_set_nrthreads(i, nthreads, net); if (rv) goto out_free; } rv = nfsd_get_nrthreads(npools, nthreads, net); if (rv) goto out_free; mesg = buf; size = SIMPLE_TRANSACTION_LIMIT; for (i = 0; i < npools && size > 0; i++) { snprintf(mesg, size, "%d%c", nthreads[i], (i == npools-1 ? '\n' : ' ')); len = strlen(mesg); size -= len; mesg += len; } rv = mesg - buf; out_free: kfree(nthreads); mutex_unlock(&nfsd_mutex); return rv; } static ssize_t nfsd_print_version_support(struct nfsd_net *nn, char *buf, int remaining, const char *sep, unsigned vers, int minor) { const char *format = minor < 0 ? "%s%c%u" : "%s%c%u.%u"; bool supported = !!nfsd_vers(nn, vers, NFSD_TEST); if (vers == 4 && minor >= 0 && !nfsd_minorversion(nn, minor, NFSD_TEST)) supported = false; if (minor == 0 && supported) /* * special case for backward compatability. * +4.0 is never reported, it is implied by * +4, unless -4.0 is present. */ return 0; return snprintf(buf, remaining, format, sep, supported ? '+' : '-', vers, minor); } static ssize_t __write_versions(struct file *file, char *buf, size_t size) { char *mesg = buf; char *vers, *minorp, sign; int len, num, remaining; ssize_t tlen = 0; char *sep; struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); if (size > 0) { if (nn->nfsd_serv) /* Cannot change versions without updating * nn->nfsd_serv->sv_xdrsize, and reallocing * rq_argp and rq_resp */ return -EBUSY; if (buf[size-1] != '\n') return -EINVAL; buf[size-1] = 0; trace_nfsd_ctl_version(netns(file), buf); vers = mesg; len = qword_get(&mesg, vers, size); if (len <= 0) return -EINVAL; do { enum vers_op cmd; unsigned minor; sign = *vers; if (sign == '+' || sign == '-') num = simple_strtol((vers+1), &minorp, 0); else num = simple_strtol(vers, &minorp, 0); if (*minorp == '.') { if (num != 4) return -EINVAL; if (kstrtouint(minorp+1, 0, &minor) < 0) return -EINVAL; } cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET; switch(num) { #ifdef CONFIG_NFSD_V2 case 2: #endif case 3: nfsd_vers(nn, num, cmd); break; case 4: if (*minorp == '.') { if (nfsd_minorversion(nn, minor, cmd) < 0) return -EINVAL; } else if ((cmd == NFSD_SET) != nfsd_vers(nn, num, NFSD_TEST)) { /* * Either we have +4 and no minors are enabled, * or we have -4 and at least one minor is enabled. * In either case, propagate 'cmd' to all minors. */ minor = 0; while (nfsd_minorversion(nn, minor, cmd) >= 0) minor++; } break; default: /* Ignore requests to disable non-existent versions */ if (cmd == NFSD_SET) return -EINVAL; } vers += len + 1; } while ((len = qword_get(&mesg, vers, size)) > 0); /* If all get turned off, turn them back on, as * having no versions is BAD */ nfsd_reset_versions(nn); } /* Now write current state into reply buffer */ sep = ""; remaining = SIMPLE_TRANSACTION_LIMIT; for (num=2 ; num <= 4 ; num++) { int minor; if (!nfsd_vers(nn, num, NFSD_AVAIL)) continue; minor = -1; do { len = nfsd_print_version_support(nn, buf, remaining, sep, num, minor); if (len >= remaining) goto out; remaining -= len; buf += len; tlen += len; minor++; if (len) sep = " "; } while (num == 4 && minor <= NFSD_SUPPORTED_MINOR_VERSION); } out: len = snprintf(buf, remaining, "\n"); if (len >= remaining) return -EINVAL; return tlen + len; } /* * write_versions - Set or report the available NFS protocol versions * * Input: * buf: ignored * size: zero * Output: * On success: passed-in buffer filled with '\n'-terminated C * string containing positive or negative integer * values representing the current status of each * protocol version; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value * * OR * * Input: * buf: C string containing whitespace- * separated positive or negative * integer values representing NFS * protocol versions to enable ("+n") * or disable ("-n") * size: non-zero length of C string in @buf * Output: * On success: status of zero or more protocol versions has * been updated; passed-in buffer filled with * '\n'-terminated C string containing positive * or negative integer values representing the * current status of each protocol version; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value */ static ssize_t write_versions(struct file *file, char *buf, size_t size) { ssize_t rv; mutex_lock(&nfsd_mutex); rv = __write_versions(file, buf, size); mutex_unlock(&nfsd_mutex); return rv; } /* * Zero-length write. Return a list of NFSD's current listener * transports. */ static ssize_t __write_ports_names(char *buf, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (nn->nfsd_serv == NULL) return 0; return svc_xprt_names(nn->nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT); } /* * A single 'fd' number was written, in which case it must be for * a socket of a supported family/protocol, and we use it as an * nfsd listener. */ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred *cred) { char *mesg = buf; int fd, err; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; err = get_int(&mesg, &fd); if (err != 0 || fd < 0) return -EINVAL; trace_nfsd_ctl_ports_addfd(net, fd); err = nfsd_create_serv(net); if (err != 0) return err; serv = nn->nfsd_serv; err = svc_addsock(serv, net, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred); if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks)) nfsd_destroy_serv(net); return err; } /* * A transport listener is added by writing its transport name and * a port number. */ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cred *cred) { char transport[16]; struct svc_xprt *xprt; int port, err; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; if (sscanf(buf, "%15s %5u", transport, &port) != 2) return -EINVAL; if (port < 1 || port > USHRT_MAX) return -EINVAL; trace_nfsd_ctl_ports_addxprt(net, transport, port); err = nfsd_create_serv(net); if (err != 0) return err; serv = nn->nfsd_serv; err = svc_xprt_create(serv, transport, net, PF_INET, port, SVC_SOCK_ANONYMOUS, cred); if (err < 0) goto out_err; err = svc_xprt_create(serv, transport, net, PF_INET6, port, SVC_SOCK_ANONYMOUS, cred); if (err < 0 && err != -EAFNOSUPPORT) goto out_close; return 0; out_close: xprt = svc_find_xprt(serv, transport, net, PF_INET, port); if (xprt != NULL) { svc_xprt_close(xprt); svc_xprt_put(xprt); } out_err: if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks)) nfsd_destroy_serv(net); return err; } static ssize_t __write_ports(struct file *file, char *buf, size_t size, struct net *net) { if (size == 0) return __write_ports_names(buf, net); if (isdigit(buf[0])) return __write_ports_addfd(buf, net, file->f_cred); if (isalpha(buf[0])) return __write_ports_addxprt(buf, net, file->f_cred); return -EINVAL; } /* * write_ports - Pass a socket file descriptor or transport name to listen on * * Input: * buf: ignored * size: zero * Output: * On success: passed-in buffer filled with a '\n'-terminated C * string containing a whitespace-separated list of * named NFSD listeners; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value * * OR * * Input: * buf: C string containing an unsigned * integer value representing a bound * but unconnected socket that is to be * used as an NFSD listener; listen(3) * must be called for a SOCK_STREAM * socket, otherwise it is ignored * size: non-zero length of C string in @buf * Output: * On success: NFS service is started; * passed-in buffer filled with a '\n'-terminated C * string containing a unique alphanumeric name of * the listener; * return code is the size in bytes of the string * On error: return code is a negative errno value * * OR * * Input: * buf: C string containing a transport * name and an unsigned integer value * representing the port to listen on, * separated by whitespace * size: non-zero length of C string in @buf * Output: * On success: returns zero; NFS service is started * On error: return code is a negative errno value */ static ssize_t write_ports(struct file *file, char *buf, size_t size) { ssize_t rv; mutex_lock(&nfsd_mutex); rv = __write_ports(file, buf, size, netns(file)); mutex_unlock(&nfsd_mutex); return rv; } int nfsd_max_blksize; /* * write_maxblksize - Set or report the current NFS blksize * * Input: * buf: ignored * size: zero * * OR * * Input: * buf: C string containing an unsigned * integer value representing the new * NFS blksize * size: non-zero length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C string * containing numeric value of the current NFS blksize * setting; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value */ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) { char *mesg = buf; struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); if (size > 0) { int bsize; int rv = get_int(&mesg, &bsize); if (rv) return rv; trace_nfsd_ctl_maxblksize(netns(file), bsize); /* force bsize into allowed range and * required alignment. */ bsize = max_t(int, bsize, 1024); bsize = min_t(int, bsize, NFSSVC_MAXBLKSIZE); bsize &= ~(1024-1); mutex_lock(&nfsd_mutex); if (nn->nfsd_serv) { mutex_unlock(&nfsd_mutex); return -EBUSY; } nfsd_max_blksize = bsize; mutex_unlock(&nfsd_mutex); } return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", nfsd_max_blksize); } /* * write_maxconn - Set or report the current max number of connections * * Input: * buf: ignored * size: zero * OR * * Input: * buf: C string containing an unsigned * integer value representing the new * number of max connections * size: non-zero length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C string * containing numeric value of max_connections setting * for this net namespace; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value */ static ssize_t write_maxconn(struct file *file, char *buf, size_t size) { char *mesg = buf; struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); unsigned int maxconn = nn->max_connections; if (size > 0) { int rv = get_uint(&mesg, &maxconn); if (rv) return rv; trace_nfsd_ctl_maxconn(netns(file), maxconn); nn->max_connections = maxconn; } return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxconn); } #ifdef CONFIG_NFSD_V4 static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time64_t *time, struct nfsd_net *nn) { struct dentry *dentry = file_dentry(file); char *mesg = buf; int rv, i; if (size > 0) { if (nn->nfsd_serv) return -EBUSY; rv = get_int(&mesg, &i); if (rv) return rv; trace_nfsd_ctl_time(netns(file), dentry->d_name.name, dentry->d_name.len, i); /* * Some sanity checking. We don't have a reason for * these particular numbers, but problems with the * extremes are: * - Too short: the briefest network outage may * cause clients to lose all their locks. Also, * the frequent polling may be wasteful. * - Too long: do you really want reboot recovery * to take more than an hour? Or to make other * clients wait an hour before being able to * revoke a dead client's locks? */ if (i < 10 || i > 3600) return -EINVAL; *time = i; } return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%lld\n", *time); } static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time64_t *time, struct nfsd_net *nn) { ssize_t rv; mutex_lock(&nfsd_mutex); rv = __nfsd4_write_time(file, buf, size, time, nn); mutex_unlock(&nfsd_mutex); return rv; } /* * write_leasetime - Set or report the current NFSv4 lease time * * Input: * buf: ignored * size: zero * * OR * * Input: * buf: C string containing an unsigned * integer value representing the new * NFSv4 lease expiry time * size: non-zero length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C * string containing unsigned integer value of the * current lease expiry time; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value */ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) { struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn); } /* * write_gracetime - Set or report current NFSv4 grace period time * * As above, but sets the time of the NFSv4 grace period. * * Note this should never be set to less than the *previous* * lease-period time, but we don't try to enforce this. (In the common * case (a new boot), we don't know what the previous lease time was * anyway.) */ static ssize_t write_gracetime(struct file *file, char *buf, size_t size) { struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn); } #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size, struct nfsd_net *nn) { char *mesg = buf; char *recdir; int len, status; if (size > 0) { if (nn->nfsd_serv) return -EBUSY; if (size > PATH_MAX || buf[size-1] != '\n') return -EINVAL; buf[size-1] = 0; recdir = mesg; len = qword_get(&mesg, recdir, size); if (len <= 0) return -EINVAL; trace_nfsd_ctl_recoverydir(netns(file), recdir); status = nfs4_reset_recoverydir(recdir); if (status) return status; } return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n", nfs4_recoverydir()); } /* * write_recoverydir - Set or report the pathname of the recovery directory * * Input: * buf: ignored * size: zero * * OR * * Input: * buf: C string containing the pathname * of the directory on a local file * system containing permanent NFSv4 * recovery data * size: non-zero length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C string * containing the current recovery pathname setting; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value */ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) { ssize_t rv; struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); mutex_lock(&nfsd_mutex); rv = __write_recoverydir(file, buf, size, nn); mutex_unlock(&nfsd_mutex); return rv; } #endif /* * write_v4_end_grace - release grace period for nfsd's v4.x lock manager * * Input: * buf: ignored * size: zero * OR * * Input: * buf: any value * size: non-zero length of C string in @buf * Output: * passed-in buffer filled with "Y" or "N" with a newline * and NULL-terminated C string. This indicates whether * the grace period has ended in the current net * namespace. Return code is the size in bytes of the * string. Writing a string that starts with 'Y', 'y', or * '1' to the file will end the grace period for nfsd's v4 * lock manager. */ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) { struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); if (size > 0) { switch(buf[0]) { case 'Y': case 'y': case '1': if (!nn->nfsd_serv) return -EBUSY; trace_nfsd_end_grace(netns(file)); nfsd4_end_grace(nn); break; default: return -EINVAL; } } return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n", nn->grace_ended ? 'Y' : 'N'); } #endif /*----------------------------------------------------------------------------*/ /* * populating the filesystem. */ /* Basically copying rpc_get_inode. */ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode) { struct inode *inode = new_inode(sb); if (!inode) return NULL; /* Following advice from simple_fill_super documentation: */ inode->i_ino = iunique(sb, NFSD_MaxReserved); inode->i_mode = mode; simple_inode_init_ts(inode); switch (mode & S_IFMT) { case S_IFDIR: inode->i_fop = &simple_dir_operations; inode->i_op = &simple_dir_inode_operations; inc_nlink(inode); break; case S_IFLNK: inode->i_op = &simple_symlink_inode_operations; break; default: break; } return inode; } static int __nfsd_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode, struct nfsdfs_client *ncl) { struct inode *inode; inode = nfsd_get_inode(dir->i_sb, mode); if (!inode) return -ENOMEM; if (ncl) { inode->i_private = ncl; kref_get(&ncl->cl_ref); } d_add(dentry, inode); inc_nlink(dir); fsnotify_mkdir(dir, dentry); return 0; } static struct dentry *nfsd_mkdir(struct dentry *parent, struct nfsdfs_client *ncl, char *name) { struct inode *dir = parent->d_inode; struct dentry *dentry; int ret = -ENOMEM; inode_lock(dir); dentry = d_alloc_name(parent, name); if (!dentry) goto out_err; ret = __nfsd_mkdir(d_inode(parent), dentry, S_IFDIR | 0600, ncl); if (ret) goto out_err; out: inode_unlock(dir); return dentry; out_err: dput(dentry); dentry = ERR_PTR(ret); goto out; } #if IS_ENABLED(CONFIG_SUNRPC_GSS) static int __nfsd_symlink(struct inode *dir, struct dentry *dentry, umode_t mode, const char *content) { struct inode *inode; inode = nfsd_get_inode(dir->i_sb, mode); if (!inode) return -ENOMEM; inode->i_link = (char *)content; inode->i_size = strlen(content); d_add(dentry, inode); inc_nlink(dir); fsnotify_create(dir, dentry); return 0; } /* * @content is assumed to be a NUL-terminated string that lives * longer than the symlink itself. */ static void _nfsd_symlink(struct dentry *parent, const char *name, const char *content) { struct inode *dir = parent->d_inode; struct dentry *dentry; int ret; inode_lock(dir); dentry = d_alloc_name(parent, name); if (!dentry) goto out; ret = __nfsd_symlink(d_inode(parent), dentry, S_IFLNK | 0777, content); if (ret) dput(dentry); out: inode_unlock(dir); } #else static inline void _nfsd_symlink(struct dentry *parent, const char *name, const char *content) { } #endif static void clear_ncl(struct dentry *dentry) { struct inode *inode = d_inode(dentry); struct nfsdfs_client *ncl = inode->i_private; spin_lock(&inode->i_lock); inode->i_private = NULL; spin_unlock(&inode->i_lock); kref_put(&ncl->cl_ref, ncl->cl_release); } struct nfsdfs_client *get_nfsdfs_client(struct inode *inode) { struct nfsdfs_client *nc; spin_lock(&inode->i_lock); nc = inode->i_private; if (nc) kref_get(&nc->cl_ref); spin_unlock(&inode->i_lock); return nc; } /* XXX: cut'n'paste from simple_fill_super; figure out if we could share * code instead. */ static int nfsdfs_create_files(struct dentry *root, const struct tree_descr *files, struct nfsdfs_client *ncl, struct dentry **fdentries) { struct inode *dir = d_inode(root); struct inode *inode; struct dentry *dentry; int i; inode_lock(dir); for (i = 0; files->name && files->name[0]; i++, files++) { dentry = d_alloc_name(root, files->name); if (!dentry) goto out; inode = nfsd_get_inode(d_inode(root)->i_sb, S_IFREG | files->mode); if (!inode) { dput(dentry); goto out; } kref_get(&ncl->cl_ref); inode->i_fop = files->ops; inode->i_private = ncl; d_add(dentry, inode); fsnotify_create(dir, dentry); if (fdentries) fdentries[i] = dentry; } inode_unlock(dir); return 0; out: inode_unlock(dir); return -ENOMEM; } /* on success, returns positive number unique to that client. */ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, struct nfsdfs_client *ncl, u32 id, const struct tree_descr *files, struct dentry **fdentries) { struct dentry *dentry; char name[11]; int ret; sprintf(name, "%u", id); dentry = nfsd_mkdir(nn->nfsd_client_dir, ncl, name); if (IS_ERR(dentry)) /* XXX: tossing errors? */ return NULL; ret = nfsdfs_create_files(dentry, files, ncl, fdentries); if (ret) { nfsd_client_rmdir(dentry); return NULL; } return dentry; } /* Taken from __rpc_rmdir: */ void nfsd_client_rmdir(struct dentry *dentry) { simple_recursive_removal(dentry, clear_ncl); } static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) { struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id); struct dentry *dentry; int ret; static const struct tree_descr nfsd_files[] = { [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO}, /* Per-export io stats use same ops as exports file */ [NFSD_Export_Stats] = {"export_stats", &exports_nfsd_operations, S_IRUGO}, [NFSD_Export_features] = {"export_features", &export_features_fops, S_IRUGO}, [NFSD_FO_UnlockIP] = {"unlock_ip", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_FO_UnlockFS] = {"unlock_filesystem", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO}, [NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &nfsd_reply_cache_stats_fops, S_IRUGO}, [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO}, #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, #endif [NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO}, #endif /* last one */ {""} }; ret = simple_fill_super(sb, 0x6e667364, nfsd_files); if (ret) return ret; _nfsd_symlink(sb->s_root, "supported_krb5_enctypes", "/proc/net/rpc/gss_krb5_enctypes"); dentry = nfsd_mkdir(sb->s_root, NULL, "clients"); if (IS_ERR(dentry)) return PTR_ERR(dentry); nn->nfsd_client_dir = dentry; return 0; } static int nfsd_fs_get_tree(struct fs_context *fc) { return get_tree_keyed(fc, nfsd_fill_super, get_net(fc->net_ns)); } static void nfsd_fs_free_fc(struct fs_context *fc) { if (fc->s_fs_info) put_net(fc->s_fs_info); } static const struct fs_context_operations nfsd_fs_context_ops = { .free = nfsd_fs_free_fc, .get_tree = nfsd_fs_get_tree, }; static int nfsd_init_fs_context(struct fs_context *fc) { put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(fc->net_ns->user_ns); fc->ops = &nfsd_fs_context_ops; return 0; } static void nfsd_umount(struct super_block *sb) { struct net *net = sb->s_fs_info; nfsd_shutdown_threads(net); kill_litter_super(sb); put_net(net); } static struct file_system_type nfsd_fs_type = { .owner = THIS_MODULE, .name = "nfsd", .init_fs_context = nfsd_init_fs_context, .kill_sb = nfsd_umount, }; MODULE_ALIAS_FS("nfsd"); #ifdef CONFIG_PROC_FS static int exports_proc_open(struct inode *inode, struct file *file) { return exports_net_open(current->nsproxy->net_ns, file); } static const struct proc_ops exports_proc_ops = { .proc_open = exports_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release, }; static int create_proc_exports_entry(void) { struct proc_dir_entry *entry; entry = proc_mkdir("fs/nfs", NULL); if (!entry) return -ENOMEM; entry = proc_create("exports", 0, entry, &exports_proc_ops); if (!entry) { remove_proc_entry("fs/nfs", NULL); return -ENOMEM; } return 0; } #else /* CONFIG_PROC_FS */ static int create_proc_exports_entry(void) { return 0; } #endif unsigned int nfsd_net_id; static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, struct netlink_callback *cb, struct nfsd_genl_rqstp *rqstp) { void *hdr; u32 i; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &nfsd_nl_family, 0, NFSD_CMD_RPC_STATUS_GET); if (!hdr) return -ENOBUFS; if (nla_put_be32(skb, NFSD_A_RPC_STATUS_XID, rqstp->rq_xid) || nla_put_u32(skb, NFSD_A_RPC_STATUS_FLAGS, rqstp->rq_flags) || nla_put_u32(skb, NFSD_A_RPC_STATUS_PROG, rqstp->rq_prog) || nla_put_u32(skb, NFSD_A_RPC_STATUS_PROC, rqstp->rq_proc) || nla_put_u8(skb, NFSD_A_RPC_STATUS_VERSION, rqstp->rq_vers) || nla_put_s64(skb, NFSD_A_RPC_STATUS_SERVICE_TIME, ktime_to_us(rqstp->rq_stime), NFSD_A_RPC_STATUS_PAD)) return -ENOBUFS; switch (rqstp->rq_saddr.sa_family) { case AF_INET: { const struct sockaddr_in *s_in, *d_in; s_in = (const struct sockaddr_in *)&rqstp->rq_saddr; d_in = (const struct sockaddr_in *)&rqstp->rq_daddr; if (nla_put_in_addr(skb, NFSD_A_RPC_STATUS_SADDR4, s_in->sin_addr.s_addr) || nla_put_in_addr(skb, NFSD_A_RPC_STATUS_DADDR4, d_in->sin_addr.s_addr) || nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT, s_in->sin_port) || nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT, d_in->sin_port)) return -ENOBUFS; break; } case AF_INET6: { const struct sockaddr_in6 *s_in, *d_in; s_in = (const struct sockaddr_in6 *)&rqstp->rq_saddr; d_in = (const struct sockaddr_in6 *)&rqstp->rq_daddr; if (nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_SADDR6, &s_in->sin6_addr) || nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_DADDR6, &d_in->sin6_addr) || nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT, s_in->sin6_port) || nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT, d_in->sin6_port)) return -ENOBUFS; break; } } for (i = 0; i < rqstp->rq_opcnt; i++) if (nla_put_u32(skb, NFSD_A_RPC_STATUS_COMPOUND_OPS, rqstp->rq_opnum[i])) return -ENOBUFS; genlmsg_end(skb, hdr); return 0; } /** * nfsd_nl_rpc_status_get_dumpit - Handle rpc_status_get dumpit * @skb: reply buffer * @cb: netlink metadata and command arguments * * Returns the size of the reply or a negative errno. */ int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { int i, ret, rqstp_index = 0; struct nfsd_net *nn; mutex_lock(&nfsd_mutex); nn = net_generic(sock_net(skb->sk), nfsd_net_id); if (!nn->nfsd_serv) { ret = -ENODEV; goto out_unlock; } rcu_read_lock(); for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) { struct svc_rqst *rqstp; if (i < cb->args[0]) /* already consumed */ continue; rqstp_index = 0; list_for_each_entry_rcu(rqstp, &nn->nfsd_serv->sv_pools[i].sp_all_threads, rq_all) { struct nfsd_genl_rqstp genl_rqstp; unsigned int status_counter; if (rqstp_index++ < cb->args[1]) /* already consumed */ continue; /* * Acquire rq_status_counter before parsing the rqst * fields. rq_status_counter is set to an odd value in * order to notify the consumers the rqstp fields are * meaningful. */ status_counter = smp_load_acquire(&rqstp->rq_status_counter); if (!(status_counter & 1)) continue; genl_rqstp.rq_xid = rqstp->rq_xid; genl_rqstp.rq_flags = rqstp->rq_flags; genl_rqstp.rq_vers = rqstp->rq_vers; genl_rqstp.rq_prog = rqstp->rq_prog; genl_rqstp.rq_proc = rqstp->rq_proc; genl_rqstp.rq_stime = rqstp->rq_stime; genl_rqstp.rq_opcnt = 0; memcpy(&genl_rqstp.rq_daddr, svc_daddr(rqstp), sizeof(struct sockaddr)); memcpy(&genl_rqstp.rq_saddr, svc_addr(rqstp), sizeof(struct sockaddr)); #ifdef CONFIG_NFSD_V4 if (rqstp->rq_vers == NFS4_VERSION && rqstp->rq_proc == NFSPROC4_COMPOUND) { /* NFSv4 compound */ struct nfsd4_compoundargs *args; int j; args = rqstp->rq_argp; genl_rqstp.rq_opcnt = args->opcnt; for (j = 0; j < genl_rqstp.rq_opcnt; j++) genl_rqstp.rq_opnum[j] = args->ops[j].opnum; } #endif /* CONFIG_NFSD_V4 */ /* * Acquire rq_status_counter before reporting the rqst * fields to the user. */ if (smp_load_acquire(&rqstp->rq_status_counter) != status_counter) continue; ret = nfsd_genl_rpc_status_compose_msg(skb, cb, &genl_rqstp); if (ret) goto out; } } cb->args[0] = i; cb->args[1] = rqstp_index; ret = skb->len; out: rcu_read_unlock(); out_unlock: mutex_unlock(&nfsd_mutex); return ret; } /** * nfsd_nl_threads_set_doit - set the number of running threads * @skb: reply buffer * @info: netlink metadata and command arguments * * Return 0 on success or a negative errno. */ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) { int *nthreads, count = 0, nrpools, i, ret = -EOPNOTSUPP, rem; struct net *net = genl_info_net(info); struct nfsd_net *nn = net_generic(net, nfsd_net_id); const struct nlattr *attr; const char *scope = NULL; if (GENL_REQ_ATTR_CHECK(info, NFSD_A_SERVER_THREADS)) return -EINVAL; /* count number of SERVER_THREADS values */ nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { if (nla_type(attr) == NFSD_A_SERVER_THREADS) count++; } mutex_lock(&nfsd_mutex); nrpools = max(count, nfsd_nrpools(net)); nthreads = kcalloc(nrpools, sizeof(int), GFP_KERNEL); if (!nthreads) { ret = -ENOMEM; goto out_unlock; } i = 0; nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { if (nla_type(attr) == NFSD_A_SERVER_THREADS) { nthreads[i++] = nla_get_u32(attr); if (i >= nrpools) break; } } if (info->attrs[NFSD_A_SERVER_GRACETIME] || info->attrs[NFSD_A_SERVER_LEASETIME] || info->attrs[NFSD_A_SERVER_SCOPE]) { ret = -EBUSY; if (nn->nfsd_serv && nn->nfsd_serv->sv_nrthreads) goto out_unlock; ret = -EINVAL; attr = info->attrs[NFSD_A_SERVER_GRACETIME]; if (attr) { u32 gracetime = nla_get_u32(attr); if (gracetime < 10 || gracetime > 3600) goto out_unlock; nn->nfsd4_grace = gracetime; } attr = info->attrs[NFSD_A_SERVER_LEASETIME]; if (attr) { u32 leasetime = nla_get_u32(attr); if (leasetime < 10 || leasetime > 3600) goto out_unlock; nn->nfsd4_lease = leasetime; } attr = info->attrs[NFSD_A_SERVER_SCOPE]; if (attr) scope = nla_data(attr); } ret = nfsd_svc(nrpools, nthreads, net, get_current_cred(), scope); if (ret > 0) ret = 0; out_unlock: mutex_unlock(&nfsd_mutex); kfree(nthreads); return ret; } /** * nfsd_nl_threads_get_doit - get the number of running threads * @skb: reply buffer * @info: netlink metadata and command arguments * * Return 0 on success or a negative errno. */ int nfsd_nl_threads_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct nfsd_net *nn = net_generic(net, nfsd_net_id); void *hdr; int err; skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = genlmsg_iput(skb, info); if (!hdr) { err = -EMSGSIZE; goto err_free_msg; } mutex_lock(&nfsd_mutex); err = nla_put_u32(skb, NFSD_A_SERVER_GRACETIME, nn->nfsd4_grace) || nla_put_u32(skb, NFSD_A_SERVER_LEASETIME, nn->nfsd4_lease) || nla_put_string(skb, NFSD_A_SERVER_SCOPE, nn->nfsd_name); if (err) goto err_unlock; if (nn->nfsd_serv) { int i; for (i = 0; i < nfsd_nrpools(net); ++i) { struct svc_pool *sp = &nn->nfsd_serv->sv_pools[i]; err = nla_put_u32(skb, NFSD_A_SERVER_THREADS, sp->sp_nrthreads); if (err) goto err_unlock; } } else { err = nla_put_u32(skb, NFSD_A_SERVER_THREADS, 0); if (err) goto err_unlock; } mutex_unlock(&nfsd_mutex); genlmsg_end(skb, hdr); return genlmsg_reply(skb, info); err_unlock: mutex_unlock(&nfsd_mutex); err_free_msg: nlmsg_free(skb); return err; } /** * nfsd_nl_version_set_doit - set the nfs enabled versions * @skb: reply buffer * @info: netlink metadata and command arguments * * Return 0 on success or a negative errno. */ int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info) { const struct nlattr *attr; struct nfsd_net *nn; int i, rem; if (GENL_REQ_ATTR_CHECK(info, NFSD_A_SERVER_PROTO_VERSION)) return -EINVAL; mutex_lock(&nfsd_mutex); nn = net_generic(genl_info_net(info), nfsd_net_id); if (nn->nfsd_serv) { mutex_unlock(&nfsd_mutex); return -EBUSY; } /* clear current supported versions. */ nfsd_vers(nn, 2, NFSD_CLEAR); nfsd_vers(nn, 3, NFSD_CLEAR); for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) nfsd_minorversion(nn, i, NFSD_CLEAR); nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { struct nlattr *tb[NFSD_A_VERSION_MAX + 1]; u32 major, minor = 0; bool enabled; if (nla_type(attr) != NFSD_A_SERVER_PROTO_VERSION) continue; if (nla_parse_nested(tb, NFSD_A_VERSION_MAX, attr, nfsd_version_nl_policy, info->extack) < 0) continue; if (!tb[NFSD_A_VERSION_MAJOR]) continue; major = nla_get_u32(tb[NFSD_A_VERSION_MAJOR]); if (tb[NFSD_A_VERSION_MINOR]) minor = nla_get_u32(tb[NFSD_A_VERSION_MINOR]); enabled = nla_get_flag(tb[NFSD_A_VERSION_ENABLED]); switch (major) { case 4: nfsd_minorversion(nn, minor, enabled ? NFSD_SET : NFSD_CLEAR); break; case 3: case 2: if (!minor) nfsd_vers(nn, major, enabled ? NFSD_SET : NFSD_CLEAR); break; default: break; } } mutex_unlock(&nfsd_mutex); return 0; } /** * nfsd_nl_version_get_doit - get the enabled status for all supported nfs versions * @skb: reply buffer * @info: netlink metadata and command arguments * * Return 0 on success or a negative errno. */ int nfsd_nl_version_get_doit(struct sk_buff *skb, struct genl_info *info) { struct nfsd_net *nn; int i, err; void *hdr; skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = genlmsg_iput(skb, info); if (!hdr) { err = -EMSGSIZE; goto err_free_msg; } mutex_lock(&nfsd_mutex); nn = net_generic(genl_info_net(info), nfsd_net_id); for (i = 2; i <= 4; i++) { int j; for (j = 0; j <= NFSD_SUPPORTED_MINOR_VERSION; j++) { struct nlattr *attr; /* Don't record any versions the kernel doesn't have * compiled in */ if (!nfsd_support_version(i)) continue; /* NFSv{2,3} does not support minor numbers */ if (i < 4 && j) continue; attr = nla_nest_start(skb, NFSD_A_SERVER_PROTO_VERSION); if (!attr) { err = -EINVAL; goto err_nfsd_unlock; } if (nla_put_u32(skb, NFSD_A_VERSION_MAJOR, i) || nla_put_u32(skb, NFSD_A_VERSION_MINOR, j)) { err = -EINVAL; goto err_nfsd_unlock; } /* Set the enabled flag if the version is enabled */ if (nfsd_vers(nn, i, NFSD_TEST) && (i < 4 || nfsd_minorversion(nn, j, NFSD_TEST)) && nla_put_flag(skb, NFSD_A_VERSION_ENABLED)) { err = -EINVAL; goto err_nfsd_unlock; } nla_nest_end(skb, attr); } } mutex_unlock(&nfsd_mutex); genlmsg_end(skb, hdr); return genlmsg_reply(skb, info); err_nfsd_unlock: mutex_unlock(&nfsd_mutex); err_free_msg: nlmsg_free(skb); return err; } /** * nfsd_nl_listener_set_doit - set the nfs running sockets * @skb: reply buffer * @info: netlink metadata and command arguments * * Return 0 on success or a negative errno. */ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct svc_xprt *xprt, *tmp; const struct nlattr *attr; struct svc_serv *serv; LIST_HEAD(permsocks); struct nfsd_net *nn; int err, rem; mutex_lock(&nfsd_mutex); err = nfsd_create_serv(net); if (err) { mutex_unlock(&nfsd_mutex); return err; } nn = net_generic(net, nfsd_net_id); serv = nn->nfsd_serv; spin_lock_bh(&serv->sv_lock); /* Move all of the old listener sockets to a temp list */ list_splice_init(&serv->sv_permsocks, &permsocks); /* * Walk the list of server_socks from userland and move any that match * back to sv_permsocks */ nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { struct nlattr *tb[NFSD_A_SOCK_MAX + 1]; const char *xcl_name; struct sockaddr *sa; if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR) continue; if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr, nfsd_sock_nl_policy, info->extack) < 0) continue; if (!tb[NFSD_A_SOCK_ADDR] || !tb[NFSD_A_SOCK_TRANSPORT_NAME]) continue; if (nla_len(tb[NFSD_A_SOCK_ADDR]) < sizeof(*sa)) continue; xcl_name = nla_data(tb[NFSD_A_SOCK_TRANSPORT_NAME]); sa = nla_data(tb[NFSD_A_SOCK_ADDR]); /* Put back any matching sockets */ list_for_each_entry_safe(xprt, tmp, &permsocks, xpt_list) { /* This shouldn't be possible */ if (WARN_ON_ONCE(xprt->xpt_net != net)) { list_move(&xprt->xpt_list, &serv->sv_permsocks); continue; } /* If everything matches, put it back */ if (!strcmp(xprt->xpt_class->xcl_name, xcl_name) && rpc_cmp_addr_port(sa, (struct sockaddr *)&xprt->xpt_local)) { list_move(&xprt->xpt_list, &serv->sv_permsocks); break; } } } /* For now, no removing old sockets while server is running */ if (serv->sv_nrthreads && !list_empty(&permsocks)) { list_splice_init(&permsocks, &serv->sv_permsocks); spin_unlock_bh(&serv->sv_lock); err = -EBUSY; goto out_unlock_mtx; } /* Close the remaining sockets on the permsocks list */ while (!list_empty(&permsocks)) { xprt = list_first_entry(&permsocks, struct svc_xprt, xpt_list); list_move(&xprt->xpt_list, &serv->sv_permsocks); /* * Newly-created sockets are born with the BUSY bit set. Clear * it if there are no threads, since nothing can pick it up * in that case. */ if (!serv->sv_nrthreads) clear_bit(XPT_BUSY, &xprt->xpt_flags); set_bit(XPT_CLOSE, &xprt->xpt_flags); spin_unlock_bh(&serv->sv_lock); svc_xprt_close(xprt); spin_lock_bh(&serv->sv_lock); } spin_unlock_bh(&serv->sv_lock); /* walk list of addrs again, open any that still don't exist */ nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { struct nlattr *tb[NFSD_A_SOCK_MAX + 1]; const char *xcl_name; struct sockaddr *sa; int ret; if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR) continue; if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr, nfsd_sock_nl_policy, info->extack) < 0) continue; if (!tb[NFSD_A_SOCK_ADDR] || !tb[NFSD_A_SOCK_TRANSPORT_NAME]) continue; if (nla_len(tb[NFSD_A_SOCK_ADDR]) < sizeof(*sa)) continue; xcl_name = nla_data(tb[NFSD_A_SOCK_TRANSPORT_NAME]); sa = nla_data(tb[NFSD_A_SOCK_ADDR]); xprt = svc_find_listener(serv, xcl_name, net, sa); if (xprt) { svc_xprt_put(xprt); continue; } ret = svc_xprt_create_from_sa(serv, xcl_name, net, sa, 0, get_current_cred()); /* always save the latest error */ if (ret < 0) err = ret; } if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks)) nfsd_destroy_serv(net); out_unlock_mtx: mutex_unlock(&nfsd_mutex); return err; } /** * nfsd_nl_listener_get_doit - get the nfs running listeners * @skb: reply buffer * @info: netlink metadata and command arguments * * Return 0 on success or a negative errno. */ int nfsd_nl_listener_get_doit(struct sk_buff *skb, struct genl_info *info) { struct svc_xprt *xprt; struct svc_serv *serv; struct nfsd_net *nn; void *hdr; int err; skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = genlmsg_iput(skb, info); if (!hdr) { err = -EMSGSIZE; goto err_free_msg; } mutex_lock(&nfsd_mutex); nn = net_generic(genl_info_net(info), nfsd_net_id); /* no nfs server? Just send empty socket list */ if (!nn->nfsd_serv) goto out_unlock_mtx; serv = nn->nfsd_serv; spin_lock_bh(&serv->sv_lock); list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) { struct nlattr *attr; attr = nla_nest_start(skb, NFSD_A_SERVER_SOCK_ADDR); if (!attr) { err = -EINVAL; goto err_serv_unlock; } if (nla_put_string(skb, NFSD_A_SOCK_TRANSPORT_NAME, xprt->xpt_class->xcl_name) || nla_put(skb, NFSD_A_SOCK_ADDR, sizeof(struct sockaddr_storage), &xprt->xpt_local)) { err = -EINVAL; goto err_serv_unlock; } nla_nest_end(skb, attr); } spin_unlock_bh(&serv->sv_lock); out_unlock_mtx: mutex_unlock(&nfsd_mutex); genlmsg_end(skb, hdr); return genlmsg_reply(skb, info); err_serv_unlock: spin_unlock_bh(&serv->sv_lock); mutex_unlock(&nfsd_mutex); err_free_msg: nlmsg_free(skb); return err; } /** * nfsd_nl_pool_mode_set_doit - set the number of running threads * @skb: reply buffer * @info: netlink metadata and command arguments * * Return 0 on success or a negative errno. */ int nfsd_nl_pool_mode_set_doit(struct sk_buff *skb, struct genl_info *info) { const struct nlattr *attr; if (GENL_REQ_ATTR_CHECK(info, NFSD_A_POOL_MODE_MODE)) return -EINVAL; attr = info->attrs[NFSD_A_POOL_MODE_MODE]; return sunrpc_set_pool_mode(nla_data(attr)); } /** * nfsd_nl_pool_mode_get_doit - get info about pool_mode * @skb: reply buffer * @info: netlink metadata and command arguments * * Return 0 on success or a negative errno. */ int nfsd_nl_pool_mode_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); char buf[16]; void *hdr; int err; if (sunrpc_get_pool_mode(buf, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) return -ERANGE; skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; err = -EMSGSIZE; hdr = genlmsg_iput(skb, info); if (!hdr) goto err_free_msg; err = nla_put_string(skb, NFSD_A_POOL_MODE_MODE, buf) | nla_put_u32(skb, NFSD_A_POOL_MODE_NPOOLS, nfsd_nrpools(net)); if (err) goto err_free_msg; genlmsg_end(skb, hdr); return genlmsg_reply(skb, info); err_free_msg: nlmsg_free(skb); return err; } /** * nfsd_net_init - Prepare the nfsd_net portion of a new net namespace * @net: a freshly-created network namespace * * This information stays around as long as the network namespace is * alive whether or not there is an NFSD instance running in the * namespace. * * Returns zero on success, or a negative errno otherwise. */ static __net_init int nfsd_net_init(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int retval; int i; retval = nfsd_export_init(net); if (retval) goto out_export_error; retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; retval = percpu_counter_init_many(nn->counter, 0, GFP_KERNEL, NFSD_STATS_COUNTERS_NUM); if (retval) goto out_repcache_error; memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats)); nn->nfsd_svcstats.program = &nfsd_programs[0]; for (i = 0; i < sizeof(nn->nfsd_versions); i++) nn->nfsd_versions[i] = nfsd_support_version(i); for (i = 0; i < sizeof(nn->nfsd4_minorversions); i++) nn->nfsd4_minorversions[i] = nfsd_support_version(4); nn->nfsd_info.mutex = &nfsd_mutex; nn->nfsd_serv = NULL; nfsd4_init_leases_net(nn); get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); nfsd_proc_stat_init(net); #if IS_ENABLED(CONFIG_NFS_LOCALIO) INIT_LIST_HEAD(&nn->local_clients); #endif return 0; out_repcache_error: nfsd_idmap_shutdown(net); out_idmap_error: nfsd_export_shutdown(net); out_export_error: return retval; } #if IS_ENABLED(CONFIG_NFS_LOCALIO) /** * nfsd_net_pre_exit - Disconnect localio clients from net namespace * @net: a network namespace that is about to be destroyed * * This invalidated ->net pointers held by localio clients * while they can still safely access nn->counter. */ static __net_exit void nfsd_net_pre_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfs_uuid_invalidate_clients(&nn->local_clients); } #endif /** * nfsd_net_exit - Release the nfsd_net portion of a net namespace * @net: a network namespace that is about to be destroyed * */ static __net_exit void nfsd_net_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfsd_proc_stat_shutdown(net); percpu_counter_destroy_many(nn->counter, NFSD_STATS_COUNTERS_NUM); nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); } static struct pernet_operations nfsd_net_ops = { .init = nfsd_net_init, #if IS_ENABLED(CONFIG_NFS_LOCALIO) .pre_exit = nfsd_net_pre_exit, #endif .exit = nfsd_net_exit, .id = &nfsd_net_id, .size = sizeof(struct nfsd_net), }; static int __init init_nfsd(void) { int retval; retval = nfsd4_init_slabs(); if (retval) return retval; retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; retval = nfsd_drc_slab_create(); if (retval) goto out_free_pnfs; nfsd_lockd_init(); /* lockd->nfsd callbacks */ retval = create_proc_exports_entry(); if (retval) goto out_free_lockd; retval = register_pernet_subsys(&nfsd_net_ops); if (retval < 0) goto out_free_exports; retval = register_cld_notifier(); if (retval) goto out_free_subsys; retval = nfsd4_create_laundry_wq(); if (retval) goto out_free_cld; retval = register_filesystem(&nfsd_fs_type); if (retval) goto out_free_all; retval = genl_register_family(&nfsd_nl_family); if (retval) goto out_free_all; nfsd_localio_ops_init(); return 0; out_free_all: nfsd4_destroy_laundry_wq(); out_free_cld: unregister_cld_notifier(); out_free_subsys: unregister_pernet_subsys(&nfsd_net_ops); out_free_exports: remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); out_free_lockd: nfsd_lockd_shutdown(); nfsd_drc_slab_free(); out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); return retval; } static void __exit exit_nfsd(void) { genl_unregister_family(&nfsd_nl_family); unregister_filesystem(&nfsd_fs_type); nfsd4_destroy_laundry_wq(); unregister_cld_notifier(); unregister_pernet_subsys(&nfsd_net_ops); nfsd_drc_slab_free(); remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); } MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); MODULE_DESCRIPTION("In-kernel NFS server"); MODULE_LICENSE("GPL"); module_init(init_nfsd) module_exit(exit_nfsd)
344 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Authentication token and access key management * * Copyright (C) 2004, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See Documentation/security/keys/core.rst for information on keys/keyrings. */ #ifndef _LINUX_KEY_H #define _LINUX_KEY_H #include <linux/types.h> #include <linux/list.h> #include <linux/rbtree.h> #include <linux/rcupdate.h> #include <linux/sysctl.h> #include <linux/rwsem.h> #include <linux/atomic.h> #include <linux/assoc_array.h> #include <linux/refcount.h> #include <linux/time64.h> #ifdef __KERNEL__ #include <linux/uidgid.h> /* key handle serial number */ typedef int32_t key_serial_t; /* key handle permissions mask */ typedef uint32_t key_perm_t; struct key; struct net; #ifdef CONFIG_KEYS #undef KEY_DEBUGGING #define KEY_POS_VIEW 0x01000000 /* possessor can view a key's attributes */ #define KEY_POS_READ 0x02000000 /* possessor can read key payload / view keyring */ #define KEY_POS_WRITE 0x04000000 /* possessor can update key payload / add link to keyring */ #define KEY_POS_SEARCH 0x08000000 /* possessor can find a key in search / search a keyring */ #define KEY_POS_LINK 0x10000000 /* possessor can create a link to a key/keyring */ #define KEY_POS_SETATTR 0x20000000 /* possessor can set key attributes */ #define KEY_POS_ALL 0x3f000000 #define KEY_USR_VIEW 0x00010000 /* user permissions... */ #define KEY_USR_READ 0x00020000 #define KEY_USR_WRITE 0x00040000 #define KEY_USR_SEARCH 0x00080000 #define KEY_USR_LINK 0x00100000 #define KEY_USR_SETATTR 0x00200000 #define KEY_USR_ALL 0x003f0000 #define KEY_GRP_VIEW 0x00000100 /* group permissions... */ #define KEY_GRP_READ 0x00000200 #define KEY_GRP_WRITE 0x00000400 #define KEY_GRP_SEARCH 0x00000800 #define KEY_GRP_LINK 0x00001000 #define KEY_GRP_SETATTR 0x00002000 #define KEY_GRP_ALL 0x00003f00 #define KEY_OTH_VIEW 0x00000001 /* third party permissions... */ #define KEY_OTH_READ 0x00000002 #define KEY_OTH_WRITE 0x00000004 #define KEY_OTH_SEARCH 0x00000008 #define KEY_OTH_LINK 0x00000010 #define KEY_OTH_SETATTR 0x00000020 #define KEY_OTH_ALL 0x0000003f #define KEY_PERM_UNDEF 0xffffffff /* * The permissions required on a key that we're looking up. */ enum key_need_perm { KEY_NEED_UNSPECIFIED, /* Needed permission unspecified */ KEY_NEED_VIEW, /* Require permission to view attributes */ KEY_NEED_READ, /* Require permission to read content */ KEY_NEED_WRITE, /* Require permission to update / modify */ KEY_NEED_SEARCH, /* Require permission to search (keyring) or find (key) */ KEY_NEED_LINK, /* Require permission to link */ KEY_NEED_SETATTR, /* Require permission to change attributes */ KEY_NEED_UNLINK, /* Require permission to unlink key */ KEY_SYSADMIN_OVERRIDE, /* Special: override by CAP_SYS_ADMIN */ KEY_AUTHTOKEN_OVERRIDE, /* Special: override by possession of auth token */ KEY_DEFER_PERM_CHECK, /* Special: permission check is deferred */ }; enum key_lookup_flag { KEY_LOOKUP_CREATE = 0x01, KEY_LOOKUP_PARTIAL = 0x02, KEY_LOOKUP_ALL = (KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL), }; struct seq_file; struct user_struct; struct signal_struct; struct cred; struct key_type; struct key_owner; struct key_tag; struct keyring_list; struct keyring_name; struct key_tag { struct rcu_head rcu; refcount_t usage; bool removed; /* T when subject removed */ }; struct keyring_index_key { /* [!] If this structure is altered, the union in struct key must change too! */ unsigned long hash; /* Hash value */ union { struct { #ifdef __LITTLE_ENDIAN /* Put desc_len at the LSB of x */ u16 desc_len; char desc[sizeof(long) - 2]; /* First few chars of description */ #else char desc[sizeof(long) - 2]; /* First few chars of description */ u16 desc_len; #endif }; unsigned long x; }; struct key_type *type; struct key_tag *domain_tag; /* Domain of operation */ const char *description; }; union key_payload { void __rcu *rcu_data0; void *data[4]; }; /*****************************************************************************/ /* * key reference with possession attribute handling * * NOTE! key_ref_t is a typedef'd pointer to a type that is not actually * defined. This is because we abuse the bottom bit of the reference to carry a * flag to indicate whether the calling process possesses that key in one of * its keyrings. * * the key_ref_t has been made a separate type so that the compiler can reject * attempts to dereference it without proper conversion. * * the three functions are used to assemble and disassemble references */ typedef struct __key_reference_with_attributes *key_ref_t; static inline key_ref_t make_key_ref(const struct key *key, bool possession) { return (key_ref_t) ((unsigned long) key | possession); } static inline struct key *key_ref_to_ptr(const key_ref_t key_ref) { return (struct key *) ((unsigned long) key_ref & ~1UL); } static inline bool is_key_possessed(const key_ref_t key_ref) { return (unsigned long) key_ref & 1UL; } typedef int (*key_restrict_link_func_t)(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key); struct key_restriction { key_restrict_link_func_t check; struct key *key; struct key_type *keytype; }; enum key_state { KEY_IS_UNINSTANTIATED, KEY_IS_POSITIVE, /* Positively instantiated */ }; /*****************************************************************************/ /* * authentication token / access credential / keyring * - types of key include: * - keyrings * - disk encryption IDs * - Kerberos TGTs and tickets */ struct key { refcount_t usage; /* number of references */ key_serial_t serial; /* key serial number */ union { struct list_head graveyard_link; struct rb_node serial_node; }; #ifdef CONFIG_KEY_NOTIFICATIONS struct watch_list *watchers; /* Entities watching this key for changes */ #endif struct rw_semaphore sem; /* change vs change sem */ struct key_user *user; /* owner of this key */ void *security; /* security data for this key */ union { time64_t expiry; /* time at which key expires (or 0) */ time64_t revoked_at; /* time at which key was revoked */ }; time64_t last_used_at; /* last time used for LRU keyring discard */ kuid_t uid; kgid_t gid; key_perm_t perm; /* access permissions */ unsigned short quotalen; /* length added to quota */ unsigned short datalen; /* payload data length * - may not match RCU dereferenced payload * - payload should contain own length */ short state; /* Key state (+) or rejection error (-) */ #ifdef KEY_DEBUGGING unsigned magic; #define KEY_DEBUG_MAGIC 0x18273645u #endif unsigned long flags; /* status flags (change with bitops) */ #define KEY_FLAG_DEAD 0 /* set if key type has been deleted */ #define KEY_FLAG_REVOKED 1 /* set if key had been revoked */ #define KEY_FLAG_IN_QUOTA 2 /* set if key consumes quota */ #define KEY_FLAG_USER_CONSTRUCT 3 /* set if key is being constructed in userspace */ #define KEY_FLAG_ROOT_CAN_CLEAR 4 /* set if key can be cleared by root without permission */ #define KEY_FLAG_INVALIDATED 5 /* set if key has been invalidated */ #define KEY_FLAG_BUILTIN 6 /* set if key is built in to the kernel */ #define KEY_FLAG_ROOT_CAN_INVAL 7 /* set if key can be invalidated by root without permission */ #define KEY_FLAG_KEEP 8 /* set if key should not be removed */ #define KEY_FLAG_UID_KEYRING 9 /* set if key is a user or user session keyring */ /* the key type and key description string * - the desc is used to match a key against search criteria * - it should be a printable string * - eg: for krb5 AFS, this might be "afs@REDHAT.COM" */ union { struct keyring_index_key index_key; struct { unsigned long hash; unsigned long len_desc; struct key_type *type; /* type of key */ struct key_tag *domain_tag; /* Domain of operation */ char *description; }; }; /* key data * - this is used to hold the data actually used in cryptography or * whatever */ union { union key_payload payload; struct { /* Keyring bits */ struct list_head name_link; struct assoc_array keys; }; }; /* This is set on a keyring to restrict the addition of a link to a key * to it. If this structure isn't provided then it is assumed that the * keyring is open to any addition. It is ignored for non-keyring * keys. Only set this value using keyring_restrict(), keyring_alloc(), * or key_alloc(). * * This is intended for use with rings of trusted keys whereby addition * to the keyring needs to be controlled. KEY_ALLOC_BYPASS_RESTRICTION * overrides this, allowing the kernel to add extra keys without * restriction. */ struct key_restriction *restrict_link; }; extern struct key *key_alloc(struct key_type *type, const char *desc, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link); #define KEY_ALLOC_IN_QUOTA 0x0000 /* add to quota, reject if would overrun */ #define KEY_ALLOC_QUOTA_OVERRUN 0x0001 /* add to quota, permit even if overrun */ #define KEY_ALLOC_NOT_IN_QUOTA 0x0002 /* not in quota */ #define KEY_ALLOC_BUILT_IN 0x0004 /* Key is built into kernel */ #define KEY_ALLOC_BYPASS_RESTRICTION 0x0008 /* Override the check on restricted keyrings */ #define KEY_ALLOC_UID_KEYRING 0x0010 /* allocating a user or user session keyring */ #define KEY_ALLOC_SET_KEEP 0x0020 /* Set the KEEP flag on the key/keyring */ extern void key_revoke(struct key *key); extern void key_invalidate(struct key *key); extern void key_put(struct key *key); extern bool key_put_tag(struct key_tag *tag); extern void key_remove_domain(struct key_tag *domain_tag); static inline struct key *__key_get(struct key *key) { refcount_inc(&key->usage); return key; } static inline struct key *key_get(struct key *key) { return key ? __key_get(key) : key; } static inline void key_ref_put(key_ref_t key_ref) { key_put(key_ref_to_ptr(key_ref)); } extern struct key *request_key_tag(struct key_type *type, const char *description, struct key_tag *domain_tag, const char *callout_info); extern struct key *request_key_rcu(struct key_type *type, const char *description, struct key_tag *domain_tag); extern struct key *request_key_with_auxdata(struct key_type *type, const char *description, struct key_tag *domain_tag, const void *callout_info, size_t callout_len, void *aux); /** * request_key - Request a key and wait for construction * @type: Type of key. * @description: The searchable description of the key. * @callout_info: The data to pass to the instantiation upcall (or NULL). * * As for request_key_tag(), but with the default global domain tag. */ static inline struct key *request_key(struct key_type *type, const char *description, const char *callout_info) { return request_key_tag(type, description, NULL, callout_info); } #ifdef CONFIG_NET /** * request_key_net - Request a key for a net namespace and wait for construction * @type: Type of key. * @description: The searchable description of the key. * @net: The network namespace that is the key's domain of operation. * @callout_info: The data to pass to the instantiation upcall (or NULL). * * As for request_key() except that it does not add the returned key to a * keyring if found, new keys are always allocated in the user's quota, the * callout_info must be a NUL-terminated string and no auxiliary data can be * passed. Only keys that operate the specified network namespace are used. * * Furthermore, it then works as wait_for_key_construction() to wait for the * completion of keys undergoing construction with a non-interruptible wait. */ #define request_key_net(type, description, net, callout_info) \ request_key_tag(type, description, net->key_domain, callout_info) /** * request_key_net_rcu - Request a key for a net namespace under RCU conditions * @type: Type of key. * @description: The searchable description of the key. * @net: The network namespace that is the key's domain of operation. * * As for request_key_rcu() except that only keys that operate the specified * network namespace are used. */ #define request_key_net_rcu(type, description, net) \ request_key_rcu(type, description, net->key_domain) #endif /* CONFIG_NET */ extern int wait_for_key_construction(struct key *key, bool intr); extern int key_validate(const struct key *key); extern key_ref_t key_create(key_ref_t keyring, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags); extern key_ref_t key_create_or_update(key_ref_t keyring, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags); extern int key_update(key_ref_t key, const void *payload, size_t plen); extern int key_link(struct key *keyring, struct key *key); extern int key_move(struct key *key, struct key *from_keyring, struct key *to_keyring, unsigned int flags); extern int key_unlink(struct key *keyring, struct key *key); extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link, struct key *dest); extern int restrict_link_reject(struct key *keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key); extern int keyring_clear(struct key *keyring); extern key_ref_t keyring_search(key_ref_t keyring, struct key_type *type, const char *description, bool recurse); extern int keyring_restrict(key_ref_t keyring, const char *type, const char *restriction); extern struct key *key_lookup(key_serial_t id); static inline key_serial_t key_serial(const struct key *key) { return key ? key->serial : 0; } extern void key_set_timeout(struct key *, unsigned); extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags, enum key_need_perm need_perm); extern void key_free_user_ns(struct user_namespace *); static inline short key_read_state(const struct key *key) { /* Barrier versus mark_key_instantiated(). */ return smp_load_acquire(&key->state); } /** * key_is_positive - Determine if a key has been positively instantiated * @key: The key to check. * * Return true if the specified key has been positively instantiated, false * otherwise. */ static inline bool key_is_positive(const struct key *key) { return key_read_state(key) == KEY_IS_POSITIVE; } static inline bool key_is_negative(const struct key *key) { return key_read_state(key) < 0; } #define dereference_key_rcu(KEY) \ (rcu_dereference((KEY)->payload.rcu_data0)) #define dereference_key_locked(KEY) \ (rcu_dereference_protected((KEY)->payload.rcu_data0, \ rwsem_is_locked(&((struct key *)(KEY))->sem))) #define rcu_assign_keypointer(KEY, PAYLOAD) \ do { \ rcu_assign_pointer((KEY)->payload.rcu_data0, (PAYLOAD)); \ } while (0) /* * the userspace interface */ extern int install_thread_keyring_to_cred(struct cred *cred); extern void key_fsuid_changed(struct cred *new_cred); extern void key_fsgid_changed(struct cred *new_cred); extern void key_init(void); #else /* CONFIG_KEYS */ #define key_validate(k) 0 #define key_serial(k) 0 #define key_get(k) ({ NULL; }) #define key_revoke(k) do { } while(0) #define key_invalidate(k) do { } while(0) #define key_put(k) do { } while(0) #define key_ref_put(k) do { } while(0) #define make_key_ref(k, p) NULL #define key_ref_to_ptr(k) NULL #define is_key_possessed(k) 0 #define key_fsuid_changed(c) do { } while(0) #define key_fsgid_changed(c) do { } while(0) #define key_init() do { } while(0) #define key_free_user_ns(ns) do { } while(0) #define key_remove_domain(d) do { } while(0) #define key_lookup(k) NULL #endif /* CONFIG_KEYS */ #endif /* __KERNEL__ */ #endif /* _LINUX_KEY_H */
45 45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 // SPDX-License-Identifier: GPL-2.0-or-later /* Copyright 2020 NXP */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/slab.h> #include <net/act_api.h> #include <net/netlink.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_gate.h> #include <net/tc_wrapper.h> static struct tc_action_ops act_gate_ops; static ktime_t gate_get_time(struct tcf_gate *gact) { ktime_t mono = ktime_get(); switch (gact->tk_offset) { case TK_OFFS_MAX: return mono; default: return ktime_mono_to_any(mono, gact->tk_offset); } return KTIME_MAX; } static void gate_get_start_time(struct tcf_gate *gact, ktime_t *start) { struct tcf_gate_params *param = &gact->param; ktime_t now, base, cycle; u64 n; base = ns_to_ktime(param->tcfg_basetime); now = gate_get_time(gact); if (ktime_after(base, now)) { *start = base; return; } cycle = param->tcfg_cycletime; n = div64_u64(ktime_sub_ns(now, base), cycle); *start = ktime_add_ns(base, (n + 1) * cycle); } static void gate_start_timer(struct tcf_gate *gact, ktime_t start) { ktime_t expires; expires = hrtimer_get_expires(&gact->hitimer); if (expires == 0) expires = KTIME_MAX; start = min_t(ktime_t, start, expires); hrtimer_start(&gact->hitimer, start, HRTIMER_MODE_ABS_SOFT); } static enum hrtimer_restart gate_timer_func(struct hrtimer *timer) { struct tcf_gate *gact = container_of(timer, struct tcf_gate, hitimer); struct tcf_gate_params *p = &gact->param; struct tcfg_gate_entry *next; ktime_t close_time, now; spin_lock(&gact->tcf_lock); next = gact->next_entry; /* cycle start, clear pending bit, clear total octets */ gact->current_gate_status = next->gate_state ? GATE_ACT_GATE_OPEN : 0; gact->current_entry_octets = 0; gact->current_max_octets = next->maxoctets; gact->current_close_time = ktime_add_ns(gact->current_close_time, next->interval); close_time = gact->current_close_time; if (list_is_last(&next->list, &p->entries)) next = list_first_entry(&p->entries, struct tcfg_gate_entry, list); else next = list_next_entry(next, list); now = gate_get_time(gact); if (ktime_after(now, close_time)) { ktime_t cycle, base; u64 n; cycle = p->tcfg_cycletime; base = ns_to_ktime(p->tcfg_basetime); n = div64_u64(ktime_sub_ns(now, base), cycle); close_time = ktime_add_ns(base, (n + 1) * cycle); } gact->next_entry = next; hrtimer_set_expires(&gact->hitimer, close_time); spin_unlock(&gact->tcf_lock); return HRTIMER_RESTART; } TC_INDIRECT_SCOPE int tcf_gate_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_gate *gact = to_gate(a); int action = READ_ONCE(gact->tcf_action); tcf_lastuse_update(&gact->tcf_tm); tcf_action_update_bstats(&gact->common, skb); spin_lock(&gact->tcf_lock); if (unlikely(gact->current_gate_status & GATE_ACT_PENDING)) { spin_unlock(&gact->tcf_lock); return action; } if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN)) { spin_unlock(&gact->tcf_lock); goto drop; } if (gact->current_max_octets >= 0) { gact->current_entry_octets += qdisc_pkt_len(skb); if (gact->current_entry_octets > gact->current_max_octets) { spin_unlock(&gact->tcf_lock); goto overlimit; } } spin_unlock(&gact->tcf_lock); return action; overlimit: tcf_action_inc_overlimit_qstats(&gact->common); drop: tcf_action_inc_drop_qstats(&gact->common); return TC_ACT_SHOT; } static const struct nla_policy entry_policy[TCA_GATE_ENTRY_MAX + 1] = { [TCA_GATE_ENTRY_INDEX] = { .type = NLA_U32 }, [TCA_GATE_ENTRY_GATE] = { .type = NLA_FLAG }, [TCA_GATE_ENTRY_INTERVAL] = { .type = NLA_U32 }, [TCA_GATE_ENTRY_IPV] = { .type = NLA_S32 }, [TCA_GATE_ENTRY_MAX_OCTETS] = { .type = NLA_S32 }, }; static const struct nla_policy gate_policy[TCA_GATE_MAX + 1] = { [TCA_GATE_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_gate)), [TCA_GATE_PRIORITY] = { .type = NLA_S32 }, [TCA_GATE_ENTRY_LIST] = { .type = NLA_NESTED }, [TCA_GATE_BASE_TIME] = { .type = NLA_U64 }, [TCA_GATE_CYCLE_TIME] = { .type = NLA_U64 }, [TCA_GATE_CYCLE_TIME_EXT] = { .type = NLA_U64 }, [TCA_GATE_FLAGS] = { .type = NLA_U32 }, [TCA_GATE_CLOCKID] = { .type = NLA_S32 }, }; static int fill_gate_entry(struct nlattr **tb, struct tcfg_gate_entry *entry, struct netlink_ext_ack *extack) { u32 interval = 0; entry->gate_state = nla_get_flag(tb[TCA_GATE_ENTRY_GATE]); if (tb[TCA_GATE_ENTRY_INTERVAL]) interval = nla_get_u32(tb[TCA_GATE_ENTRY_INTERVAL]); if (interval == 0) { NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); return -EINVAL; } entry->interval = interval; entry->ipv = nla_get_s32_default(tb[TCA_GATE_ENTRY_IPV], -1); entry->maxoctets = nla_get_s32_default(tb[TCA_GATE_ENTRY_MAX_OCTETS], -1); return 0; } static int parse_gate_entry(struct nlattr *n, struct tcfg_gate_entry *entry, int index, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_GATE_ENTRY_MAX + 1] = { }; int err; err = nla_parse_nested(tb, TCA_GATE_ENTRY_MAX, n, entry_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Could not parse nested entry"); return -EINVAL; } entry->index = index; return fill_gate_entry(tb, entry, extack); } static void release_entry_list(struct list_head *entries) { struct tcfg_gate_entry *entry, *e; list_for_each_entry_safe(entry, e, entries, list) { list_del(&entry->list); kfree(entry); } } static int parse_gate_list(struct nlattr *list_attr, struct tcf_gate_params *sched, struct netlink_ext_ack *extack) { struct tcfg_gate_entry *entry; struct nlattr *n; int err, rem; int i = 0; if (!list_attr) return -EINVAL; nla_for_each_nested(n, list_attr, rem) { if (nla_type(n) != TCA_GATE_ONE_ENTRY) { NL_SET_ERR_MSG(extack, "Attribute isn't type 'entry'"); continue; } entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) { NL_SET_ERR_MSG(extack, "Not enough memory for entry"); err = -ENOMEM; goto release_list; } err = parse_gate_entry(n, entry, i, extack); if (err < 0) { kfree(entry); goto release_list; } list_add_tail(&entry->list, &sched->entries); i++; } sched->num_entries = i; return i; release_list: release_entry_list(&sched->entries); return err; } static void gate_setup_timer(struct tcf_gate *gact, u64 basetime, enum tk_offsets tko, s32 clockid, bool do_init) { if (!do_init) { if (basetime == gact->param.tcfg_basetime && tko == gact->tk_offset && clockid == gact->param.tcfg_clockid) return; spin_unlock_bh(&gact->tcf_lock); hrtimer_cancel(&gact->hitimer); spin_lock_bh(&gact->tcf_lock); } gact->param.tcfg_basetime = basetime; gact->param.tcfg_clockid = clockid; gact->tk_offset = tko; hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); gact->hitimer.function = gate_timer_func; } static int tcf_gate_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id); enum tk_offsets tk_offset = TK_OFFS_TAI; bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_GATE_MAX + 1]; struct tcf_chain *goto_ch = NULL; u64 cycletime = 0, basetime = 0; struct tcf_gate_params *p; s32 clockid = CLOCK_TAI; struct tcf_gate *gact; struct tc_gate *parm; int ret = 0, err; u32 gflags = 0; s32 prio = -1; ktime_t start; u32 index; if (!nla) return -EINVAL; err = nla_parse_nested(tb, TCA_GATE_MAX, nla, gate_policy, extack); if (err < 0) return err; if (!tb[TCA_GATE_PARMS]) return -EINVAL; if (tb[TCA_GATE_CLOCKID]) { clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); switch (clockid) { case CLOCK_REALTIME: tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); return -EINVAL; } } parm = nla_data(tb[TCA_GATE_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; if (err && bind) return ACT_P_BOUND; if (!err) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_gate_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } if (tb[TCA_GATE_PRIORITY]) prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); if (tb[TCA_GATE_BASE_TIME]) basetime = nla_get_u64(tb[TCA_GATE_BASE_TIME]); if (tb[TCA_GATE_FLAGS]) gflags = nla_get_u32(tb[TCA_GATE_FLAGS]); gact = to_gate(*a); if (ret == ACT_P_CREATED) INIT_LIST_HEAD(&gact->param.entries); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; spin_lock_bh(&gact->tcf_lock); p = &gact->param; if (tb[TCA_GATE_CYCLE_TIME]) cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); if (tb[TCA_GATE_ENTRY_LIST]) { err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); if (err < 0) goto chain_put; } if (!cycletime) { struct tcfg_gate_entry *entry; ktime_t cycle = 0; list_for_each_entry(entry, &p->entries, list) cycle = ktime_add_ns(cycle, entry->interval); cycletime = cycle; if (!cycletime) { err = -EINVAL; goto chain_put; } } p->tcfg_cycletime = cycletime; if (tb[TCA_GATE_CYCLE_TIME_EXT]) p->tcfg_cycletime_ext = nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); gate_setup_timer(gact, basetime, tk_offset, clockid, ret == ACT_P_CREATED); p->tcfg_priority = prio; p->tcfg_flags = gflags; gate_get_start_time(gact, &start); gact->current_close_time = start; gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING; gact->next_entry = list_first_entry(&p->entries, struct tcfg_gate_entry, list); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); gate_start_timer(gact, start); spin_unlock_bh(&gact->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; chain_put: spin_unlock_bh(&gact->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: /* action is not inserted in any list: it's safe to init hitimer * without taking tcf_lock. */ if (ret == ACT_P_CREATED) gate_setup_timer(gact, gact->param.tcfg_basetime, gact->tk_offset, gact->param.tcfg_clockid, true); tcf_idr_release(*a, bind); return err; } static void tcf_gate_cleanup(struct tc_action *a) { struct tcf_gate *gact = to_gate(a); struct tcf_gate_params *p; p = &gact->param; hrtimer_cancel(&gact->hitimer); release_entry_list(&p->entries); } static int dumping_entry(struct sk_buff *skb, struct tcfg_gate_entry *entry) { struct nlattr *item; item = nla_nest_start_noflag(skb, TCA_GATE_ONE_ENTRY); if (!item) return -ENOSPC; if (nla_put_u32(skb, TCA_GATE_ENTRY_INDEX, entry->index)) goto nla_put_failure; if (entry->gate_state && nla_put_flag(skb, TCA_GATE_ENTRY_GATE)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GATE_ENTRY_INTERVAL, entry->interval)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_ENTRY_MAX_OCTETS, entry->maxoctets)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_ENTRY_IPV, entry->ipv)) goto nla_put_failure; return nla_nest_end(skb, item); nla_put_failure: nla_nest_cancel(skb, item); return -1; } static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_gate *gact = to_gate(a); struct tc_gate opt = { .index = gact->tcf_index, .refcnt = refcount_read(&gact->tcf_refcnt) - ref, .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, }; struct tcfg_gate_entry *entry; struct tcf_gate_params *p; struct nlattr *entry_list; struct tcf_t t; spin_lock_bh(&gact->tcf_lock); opt.action = gact->tcf_action; p = &gact->param; if (nla_put(skb, TCA_GATE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_GATE_BASE_TIME, p->tcfg_basetime, TCA_GATE_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME, p->tcfg_cycletime, TCA_GATE_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME_EXT, p->tcfg_cycletime_ext, TCA_GATE_PAD)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_CLOCKID, p->tcfg_clockid)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GATE_FLAGS, p->tcfg_flags)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_PRIORITY, p->tcfg_priority)) goto nla_put_failure; entry_list = nla_nest_start_noflag(skb, TCA_GATE_ENTRY_LIST); if (!entry_list) goto nla_put_failure; list_for_each_entry(entry, &p->entries, list) { if (dumping_entry(skb, entry) < 0) goto nla_put_failure; } nla_nest_end(skb, entry_list); tcf_tm_dump(&t, &gact->tcf_tm); if (nla_put_64bit(skb, TCA_GATE_TM, sizeof(t), &t, TCA_GATE_PAD)) goto nla_put_failure; spin_unlock_bh(&gact->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&gact->tcf_lock); nlmsg_trim(skb, b); return -1; } static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_gate *gact = to_gate(a); struct tcf_t *tm = &gact->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static size_t tcf_gate_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_gate)); } static void tcf_gate_entry_destructor(void *priv) { struct action_gate_entry *oe = priv; kfree(oe); } static int tcf_gate_get_entries(struct flow_action_entry *entry, const struct tc_action *act) { entry->gate.entries = tcf_gate_get_list(act); if (!entry->gate.entries) return -EINVAL; entry->destructor = tcf_gate_entry_destructor; entry->destructor_priv = entry->gate.entries; return 0; } static int tcf_gate_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { int err; if (bind) { struct flow_action_entry *entry = entry_data; entry->id = FLOW_ACTION_GATE; entry->gate.prio = tcf_gate_prio(act); entry->gate.basetime = tcf_gate_basetime(act); entry->gate.cycletime = tcf_gate_cycletime(act); entry->gate.cycletimeext = tcf_gate_cycletimeext(act); entry->gate.num_entries = tcf_gate_num_entries(act); err = tcf_gate_get_entries(entry, act); if (err) return err; *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; fl_action->id = FLOW_ACTION_GATE; } return 0; } static struct tc_action_ops act_gate_ops = { .kind = "gate", .id = TCA_ID_GATE, .owner = THIS_MODULE, .act = tcf_gate_act, .dump = tcf_gate_dump, .init = tcf_gate_init, .cleanup = tcf_gate_cleanup, .stats_update = tcf_gate_stats_update, .get_fill_size = tcf_gate_get_fill_size, .offload_act_setup = tcf_gate_offload_act_setup, .size = sizeof(struct tcf_gate), }; MODULE_ALIAS_NET_ACT("gate"); static __net_init int gate_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id); return tc_action_net_init(net, tn, &act_gate_ops); } static void __net_exit gate_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_gate_ops.net_id); } static struct pernet_operations gate_net_ops = { .init = gate_init_net, .exit_batch = gate_exit_net, .id = &act_gate_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init gate_init_module(void) { return tcf_register_action(&act_gate_ops, &gate_net_ops); } static void __exit gate_cleanup_module(void) { tcf_unregister_action(&act_gate_ops, &gate_net_ops); } module_init(gate_init_module); module_exit(gate_cleanup_module); MODULE_DESCRIPTION("TC gate action"); MODULE_LICENSE("GPL v2");
25 24 11 11 11 11 11 11 11 11 11 10 11 63 64 64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2006, Johannes Berg <johannes@sipsolutions.net> */ /* just for IFNAMSIZ */ #include <linux/if.h> #include <linux/slab.h> #include <linux/export.h> #include "led.h" void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) { if (!atomic_read(&local->assoc_led_active)) return; if (associated) led_trigger_event(&local->assoc_led, LED_FULL); else led_trigger_event(&local->assoc_led, LED_OFF); } void ieee80211_led_radio(struct ieee80211_local *local, bool enabled) { if (!atomic_read(&local->radio_led_active)) return; if (enabled) led_trigger_event(&local->radio_led, LED_FULL); else led_trigger_event(&local->radio_led, LED_OFF); } void ieee80211_alloc_led_names(struct ieee80211_local *local) { local->rx_led.name = kasprintf(GFP_KERNEL, "%srx", wiphy_name(local->hw.wiphy)); local->tx_led.name = kasprintf(GFP_KERNEL, "%stx", wiphy_name(local->hw.wiphy)); local->assoc_led.name = kasprintf(GFP_KERNEL, "%sassoc", wiphy_name(local->hw.wiphy)); local->radio_led.name = kasprintf(GFP_KERNEL, "%sradio", wiphy_name(local->hw.wiphy)); } void ieee80211_free_led_names(struct ieee80211_local *local) { kfree(local->rx_led.name); kfree(local->tx_led.name); kfree(local->assoc_led.name); kfree(local->radio_led.name); } static int ieee80211_tx_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tx_led); atomic_inc(&local->tx_led_active); return 0; } static void ieee80211_tx_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tx_led); atomic_dec(&local->tx_led_active); } static int ieee80211_rx_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, rx_led); atomic_inc(&local->rx_led_active); return 0; } static void ieee80211_rx_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, rx_led); atomic_dec(&local->rx_led_active); } static int ieee80211_assoc_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, assoc_led); atomic_inc(&local->assoc_led_active); return 0; } static void ieee80211_assoc_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, assoc_led); atomic_dec(&local->assoc_led_active); } static int ieee80211_radio_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, radio_led); atomic_inc(&local->radio_led_active); return 0; } static void ieee80211_radio_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, radio_led); atomic_dec(&local->radio_led_active); } static int ieee80211_tpt_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tpt_led); atomic_inc(&local->tpt_led_active); return 0; } static void ieee80211_tpt_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tpt_led); atomic_dec(&local->tpt_led_active); } void ieee80211_led_init(struct ieee80211_local *local) { atomic_set(&local->rx_led_active, 0); local->rx_led.activate = ieee80211_rx_led_activate; local->rx_led.deactivate = ieee80211_rx_led_deactivate; if (local->rx_led.name && led_trigger_register(&local->rx_led)) { kfree(local->rx_led.name); local->rx_led.name = NULL; } atomic_set(&local->tx_led_active, 0); local->tx_led.activate = ieee80211_tx_led_activate; local->tx_led.deactivate = ieee80211_tx_led_deactivate; if (local->tx_led.name && led_trigger_register(&local->tx_led)) { kfree(local->tx_led.name); local->tx_led.name = NULL; } atomic_set(&local->assoc_led_active, 0); local->assoc_led.activate = ieee80211_assoc_led_activate; local->assoc_led.deactivate = ieee80211_assoc_led_deactivate; if (local->assoc_led.name && led_trigger_register(&local->assoc_led)) { kfree(local->assoc_led.name); local->assoc_led.name = NULL; } atomic_set(&local->radio_led_active, 0); local->radio_led.activate = ieee80211_radio_led_activate; local->radio_led.deactivate = ieee80211_radio_led_deactivate; if (local->radio_led.name && led_trigger_register(&local->radio_led)) { kfree(local->radio_led.name); local->radio_led.name = NULL; } atomic_set(&local->tpt_led_active, 0); if (local->tpt_led_trigger) { local->tpt_led.activate = ieee80211_tpt_led_activate; local->tpt_led.deactivate = ieee80211_tpt_led_deactivate; if (led_trigger_register(&local->tpt_led)) { kfree(local->tpt_led_trigger); local->tpt_led_trigger = NULL; } } } void ieee80211_led_exit(struct ieee80211_local *local) { if (local->radio_led.name) led_trigger_unregister(&local->radio_led); if (local->assoc_led.name) led_trigger_unregister(&local->assoc_led); if (local->tx_led.name) led_trigger_unregister(&local->tx_led); if (local->rx_led.name) led_trigger_unregister(&local->rx_led); if (local->tpt_led_trigger) { led_trigger_unregister(&local->tpt_led); kfree(local->tpt_led_trigger); } } const char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->radio_led.name; } EXPORT_SYMBOL(__ieee80211_get_radio_led_name); const char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->assoc_led.name; } EXPORT_SYMBOL(__ieee80211_get_assoc_led_name); const char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->tx_led.name; } EXPORT_SYMBOL(__ieee80211_get_tx_led_name); const char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->rx_led.name; } EXPORT_SYMBOL(__ieee80211_get_rx_led_name); static unsigned long tpt_trig_traffic(struct ieee80211_local *local, struct tpt_led_trigger *tpt_trig) { unsigned long traffic, delta; traffic = tpt_trig->tx_bytes + tpt_trig->rx_bytes; delta = traffic - tpt_trig->prev_traffic; tpt_trig->prev_traffic = traffic; return DIV_ROUND_UP(delta, 1024 / 8); } static void tpt_trig_timer(struct timer_list *t) { struct tpt_led_trigger *tpt_trig = from_timer(tpt_trig, t, timer); struct ieee80211_local *local = tpt_trig->local; unsigned long on, off, tpt; int i; if (!tpt_trig->running) return; mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ)); tpt = tpt_trig_traffic(local, tpt_trig); /* default to just solid on */ on = 1; off = 0; for (i = tpt_trig->blink_table_len - 1; i >= 0; i--) { if (tpt_trig->blink_table[i].throughput < 0 || tpt > tpt_trig->blink_table[i].throughput) { off = tpt_trig->blink_table[i].blink_time / 2; on = tpt_trig->blink_table[i].blink_time - off; break; } } led_trigger_blink(&local->tpt_led, on, off); } const char * __ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, unsigned int flags, const struct ieee80211_tpt_blink *blink_table, unsigned int blink_table_len) { struct ieee80211_local *local = hw_to_local(hw); struct tpt_led_trigger *tpt_trig; if (WARN_ON(local->tpt_led_trigger)) return NULL; tpt_trig = kzalloc(sizeof(struct tpt_led_trigger), GFP_KERNEL); if (!tpt_trig) return NULL; snprintf(tpt_trig->name, sizeof(tpt_trig->name), "%stpt", wiphy_name(local->hw.wiphy)); local->tpt_led.name = tpt_trig->name; tpt_trig->blink_table = blink_table; tpt_trig->blink_table_len = blink_table_len; tpt_trig->want = flags; tpt_trig->local = local; timer_setup(&tpt_trig->timer, tpt_trig_timer, 0); local->tpt_led_trigger = tpt_trig; return tpt_trig->name; } EXPORT_SYMBOL(__ieee80211_create_tpt_led_trigger); static void ieee80211_start_tpt_led_trig(struct ieee80211_local *local) { struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; if (tpt_trig->running) return; /* reset traffic */ tpt_trig_traffic(local, tpt_trig); tpt_trig->running = true; tpt_trig_timer(&tpt_trig->timer); mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ)); } static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local) { struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; if (!tpt_trig->running) return; tpt_trig->running = false; del_timer_sync(&tpt_trig->timer); led_trigger_event(&local->tpt_led, LED_OFF); } void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, unsigned int types_on, unsigned int types_off) { struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; bool allowed; WARN_ON(types_on & types_off); if (!tpt_trig) return; tpt_trig->active &= ~types_off; tpt_trig->active |= types_on; /* * Regardless of wanted state, we shouldn't blink when * the radio is disabled -- this can happen due to some * code ordering issues with __ieee80211_recalc_idle() * being called before the radio is started. */ allowed = tpt_trig->active & IEEE80211_TPT_LEDTRIG_FL_RADIO; if (!allowed || !(tpt_trig->active & tpt_trig->want)) ieee80211_stop_tpt_led_trig(local); else ieee80211_start_tpt_led_trig(local); }
42 42 42 42 42 42 42 42 42 42 45 39 42 42 42 42 42 15 86 87 80 15 61 61 29 29 75 37 75 52 52 37 75 75 75 75 75 75 37 37 75 75 37 37 75 51 51 51 122 2 122 1 122 37 37 37 37 37 37 11 11 11 10 10 10 10 188 2 188 189 189 188 188 189 189 75 75 75 75 47 75 75 75 75 37 43 41 75 75 47 187 189 189 188 189 189 19 19 19 207 207 207 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> #include <crypto/sha2.h> #include <crypto/utils.h> #include <net/sock.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> #include <net/protocol.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include <net/ip6_route.h> #include <net/transp_v6.h> #endif #include <net/mptcp.h> #include "protocol.h" #include "mib.h" #include <trace/events/mptcp.h> #include <trace/events/sock.h> static void mptcp_subflow_ops_undo_override(struct sock *ssk); static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, enum linux_mptcp_mib_field field) { MPTCP_INC_STATS(sock_net(req_to_sk(req)), field); } static void subflow_req_destructor(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); pr_debug("subflow_req=%p\n", subflow_req); if (subflow_req->msk) sock_put((struct sock *)subflow_req->msk); mptcp_token_destroy_request(req); } static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, void *hmac) { u8 msg[8]; put_unaligned_be32(nonce1, &msg[0]); put_unaligned_be32(nonce2, &msg[4]); mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); } static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk) { return mptcp_is_fully_established((void *)msk) && ((mptcp_pm_is_userspace(msk) && mptcp_userspace_pm_active(msk)) || READ_ONCE(msk->pm.accept_subflow)); } /* validate received token and create truncated hmac and nonce for SYN-ACK */ static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req) { struct mptcp_sock *msk = subflow_req->msk; u8 hmac[SHA256_DIGEST_SIZE]; get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); subflow_generate_hmac(READ_ONCE(msk->local_key), READ_ONCE(msk->remote_key), subflow_req->local_nonce, subflow_req->remote_nonce, hmac); subflow_req->thmac = get_unaligned_be64(hmac); } static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_sock *msk; int local_id; msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), subflow_req->token); if (!msk) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN); return NULL; } local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req); if (local_id < 0) { sock_put((struct sock *)msk); return NULL; } subflow_req->local_id = local_id; subflow_req->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)req); return msk; } static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); subflow_req->mp_capable = 0; subflow_req->mp_join = 0; subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener)); subflow_req->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk_listener)); subflow_req->msk = NULL; mptcp_token_init_request(req); } static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk) { return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport; } static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason) { struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (mpext) { memset(mpext, 0, sizeof(*mpext)); mpext->reset_reason = reason; } } static int subflow_reset_req_endp(struct request_sock *req, struct sk_buff *skb) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEENDPATTEMPT); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; } /* Init mptcp request socket. * * Returns an error code if a JOIN has failed and a TCP reset * should be sent. */ static int subflow_check_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; bool opt_mp_capable, opt_mp_join; pr_debug("subflow_req=%p, listener=%p\n", subflow_req, listener); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. */ if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) { subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); return -EINVAL; } #endif mptcp_get_options(skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYN); opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYN); if (opt_mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); if (unlikely(listener->pm_listener)) return subflow_reset_req_endp(req, skb); if (opt_mp_join) return 0; } else if (opt_mp_join) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); if (mp_opt.backup) SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNBACKUPRX); } else if (unlikely(listener->pm_listener)) { return subflow_reset_req_endp(req, skb); } if (opt_mp_capable && listener->request_mptcp) { int err, retries = MPTCP_TOKEN_MAX_RETRIES; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; again: do { get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key)); } while (subflow_req->local_key == 0); if (unlikely(req->syncookie)) { mptcp_crypto_key_sha(subflow_req->local_key, &subflow_req->token, &subflow_req->idsn); if (mptcp_token_exists(subflow_req->token)) { if (retries-- > 0) goto again; SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); } else { subflow_req->mp_capable = 1; } return 0; } err = mptcp_token_new_request(req); if (err == 0) subflow_req->mp_capable = 1; else if (retries-- > 0) goto again; else SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); } else if (opt_mp_join && listener->request_mptcp) { subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; subflow_req->mp_join = 1; subflow_req->backup = mp_opt.backup; subflow_req->remote_id = mp_opt.join_id; subflow_req->token = mp_opt.token; subflow_req->remote_nonce = mp_opt.nonce; subflow_req->msk = subflow_token_join_request(req); /* Can't fall back to TCP in this case. */ if (!subflow_req->msk) { subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); return -EPERM; } if (subflow_use_different_sport(subflow_req->msk, sk_listener)) { pr_debug("syn inet_sport=%d %d\n", ntohs(inet_sk(sk_listener)->inet_sport), ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX); } subflow_req_create_thmac(subflow_req); if (unlikely(req->syncookie)) { if (!mptcp_can_accept_new_subflow(subflow_req->msk)) { subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; } subflow_init_req_cookie_join_save(subflow_req, skb); } pr_debug("token=%u, remote_nonce=%u msk=%p\n", subflow_req->token, subflow_req->remote_nonce, subflow_req->msk); } return 0; } int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; bool opt_mp_capable, opt_mp_join; int err; subflow_init_req(req, sk_listener); mptcp_get_options(skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_ACK); opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK); if (opt_mp_capable && opt_mp_join) return -EINVAL; if (opt_mp_capable && listener->request_mptcp) { if (mp_opt.sndr_key == 0) return -EINVAL; subflow_req->local_key = mp_opt.rcvr_key; err = mptcp_token_new_request(req); if (err) return err; subflow_req->mp_capable = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; } else if (opt_mp_join && listener->request_mptcp) { if (!mptcp_token_join_cookie_init_state(subflow_req, skb)) return -EINVAL; subflow_req->mp_join = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; } return 0; } EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); static enum sk_rst_reason mptcp_get_rst_reason(const struct sk_buff *skb) { const struct mptcp_ext *mpext = mptcp_get_ext(skb); if (!mpext) return SK_RST_REASON_NOT_SPECIFIED; return sk_rst_convert_mptcp_reason(mpext->reset_reason); } static struct dst_entry *subflow_v4_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req, u32 tw_isn) { struct dst_entry *dst; int err; tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req, tw_isn); if (!dst) return NULL; err = subflow_check_req(req, sk, skb); if (err == 0) return dst; dst_release(dst); if (!req->syncookie) tcp_request_sock_ops.send_reset(sk, skb, mptcp_get_rst_reason(skb)); return NULL; } static void subflow_prep_synack(const struct sock *sk, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct inet_request_sock *ireq = inet_rsk(req); /* clear tstamp_ok, as needed depending on cookie */ if (foc && foc->len > -1) ireq->tstamp_ok = 0; if (synack_type == TCP_SYNACK_FASTOPEN) mptcp_fastopen_subflow_synack_set_params(subflow, req); } static int subflow_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { subflow_prep_synack(sk, req, foc, synack_type); return tcp_request_sock_ipv4_ops.send_synack(sk, dst, fl, req, foc, synack_type, syn_skb); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { subflow_prep_synack(sk, req, foc, synack_type); return tcp_request_sock_ipv6_ops.send_synack(sk, dst, fl, req, foc, synack_type, syn_skb); } static struct dst_entry *subflow_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req, u32 tw_isn) { struct dst_entry *dst; int err; tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req, tw_isn); if (!dst) return NULL; err = subflow_check_req(req, sk, skb); if (err == 0) return dst; dst_release(dst); if (!req->syncookie) tcp6_request_sock_ops.send_reset(sk, skb, mptcp_get_rst_reason(skb)); return NULL; } #endif /* validate received truncated hmac and create hmac for third ACK */ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow) { u8 hmac[SHA256_DIGEST_SIZE]; u64 thmac; subflow_generate_hmac(subflow->remote_key, subflow->local_key, subflow->remote_nonce, subflow->local_nonce, hmac); thmac = get_unaligned_be64(hmac); pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n", subflow, subflow->token, thmac, subflow->thmac); return thmac == subflow->thmac; } void mptcp_subflow_reset(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; /* mptcp_mp_fail_no_response() can reach here on an already closed * socket */ if (ssk->sk_state == TCP_CLOSE) return; /* must hold: tcp_done() could drop last reference on parent */ sock_hold(sk); mptcp_send_active_reset_reason(ssk); tcp_done(ssk); if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) mptcp_schedule_work(sk); sock_put(sk); } static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk) { return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport; } void __mptcp_sync_state(struct sock *sk, int state) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk = msk->first; subflow = mptcp_subflow_ctx(ssk); __mptcp_propagate_sndbuf(sk, ssk); if (!msk->rcvspace_init) mptcp_rcv_space_init(msk, ssk); if (sk->sk_state == TCP_SYN_SENT) { /* subflow->idsn is always available is TCP_SYN_SENT state, * even for the FASTOPEN scenarios */ WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->snd_nxt, msk->write_seq); mptcp_set_state(sk, state); sk->sk_state_change(sk); } } static void subflow_set_remote_key(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt) { /* active MPC subflow will reach here multiple times: * at subflow_finish_connect() time and at 4th ack time */ if (subflow->remote_key_valid) return; subflow->remote_key_valid = 1; subflow->remote_key = mp_opt->sndr_key; mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn); subflow->iasn++; WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->ack_seq, subflow->iasn); WRITE_ONCE(msk->can_ack, true); atomic64_set(&msk->rcv_wnd_sent, subflow->iasn); } static void mptcp_propagate_state(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt) { struct mptcp_sock *msk = mptcp_sk(sk); mptcp_data_lock(sk); if (mp_opt) { /* Options are available only in the non fallback cases * avoid updating rx path fields otherwise */ WRITE_ONCE(msk->snd_una, subflow->idsn + 1); WRITE_ONCE(msk->wnd_end, subflow->idsn + 1 + tcp_sk(ssk)->snd_wnd); subflow_set_remote_key(msk, subflow, mp_opt); } if (!sock_owned_by_user(sk)) { __mptcp_sync_state(sk, ssk->sk_state); } else { msk->pending_state = ssk->sk_state; __set_bit(MPTCP_SYNC_STATE, &msk->cb_flags); } mptcp_data_unlock(sk); } static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_options_received mp_opt; struct sock *parent = subflow->conn; struct mptcp_sock *msk; subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); /* be sure no special action on any packet other than syn-ack */ if (subflow->conn_finished) return; msk = mptcp_sk(parent); subflow->rel_write_seq = 1; subflow->conn_finished = 1; subflow->ssn_offset = TCP_SKB_CB(skb)->seq; pr_debug("subflow=%p synack seq=%x\n", subflow, subflow->ssn_offset); mptcp_get_options(skb, &mp_opt); if (subflow->request_mptcp) { if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); mptcp_do_fallback(sk); pr_fallback(msk); goto fallback; } if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD) WRITE_ONCE(msk->csum_enabled, true); if (mp_opt.deny_join_id0) WRITE_ONCE(msk->pm.remote_deny_join_id0, true); subflow->mp_capable = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); mptcp_finish_connect(sk); mptcp_active_enable(parent); mptcp_propagate_state(parent, sk, subflow, &mp_opt); } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYNACK)) { subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } subflow->backup = mp_opt.backup; subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; WRITE_ONCE(subflow->remote_id, mp_opt.join_id); pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d\n", subflow, subflow->thmac, subflow->remote_nonce, subflow->backup); if (!subflow_thmac_valid(subflow)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } if (!mptcp_finish_join(sk)) goto do_reset; subflow_generate_hmac(subflow->local_key, subflow->remote_key, subflow->local_nonce, subflow->remote_nonce, hmac); memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); if (subflow->backup) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKBACKUPRX); if (subflow_use_different_dport(msk, sk)) { pr_debug("synack inet_dport=%d %d\n", ntohs(inet_sk(sk)->inet_dport), ntohs(inet_sk(parent)->inet_dport)); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX); } } else if (mptcp_check_fallback(sk)) { /* It looks like MPTCP is blocked, while TCP is not */ if (subflow->mpc_drop) mptcp_active_disable(parent); fallback: mptcp_propagate_state(parent, sk, subflow, NULL); } return; do_reset: subflow->reset_transient = 0; mptcp_subflow_reset(sk); } static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id) { WARN_ON_ONCE(local_id < 0 || local_id > 255); WRITE_ONCE(subflow->local_id, local_id); } static int subflow_chk_local_id(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); int err; if (likely(subflow->local_id >= 0)) return 0; err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk); if (err < 0) return err; subflow_set_local_id(subflow, err); subflow->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)sk); return 0; } static int subflow_rebuild_header(struct sock *sk) { int err = subflow_chk_local_id(sk); if (unlikely(err < 0)) return err; return inet_sk_rebuild_header(sk); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static int subflow_v6_rebuild_header(struct sock *sk) { int err = subflow_chk_local_id(sk); if (unlikely(err < 0)) return err; return inet6_sk_rebuild_header(sk); } #endif static struct request_sock_ops mptcp_subflow_v4_request_sock_ops __ro_after_init; static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init; static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); pr_debug("subflow=%p\n", subflow); /* Never answer to SYNs sent to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; return tcp_conn_request(&mptcp_subflow_v4_request_sock_ops, &subflow_request_sock_ipv4_ops, sk, skb); drop: tcp_listendrop(sk); return 0; } static void subflow_v4_req_destructor(struct request_sock *req) { subflow_req_destructor(req); tcp_request_sock_ops.destructor(req); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init; static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init; static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init; static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init; static struct proto tcpv6_prot_override __ro_after_init; static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); pr_debug("subflow=%p\n", subflow); if (skb->protocol == htons(ETH_P_IP)) return subflow_v4_conn_request(sk, skb); if (!ipv6_unicast_destination(skb)) goto drop; if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); return 0; } return tcp_conn_request(&mptcp_subflow_v6_request_sock_ops, &subflow_request_sock_ipv6_ops, sk, skb); drop: tcp_listendrop(sk); return 0; /* don't send reset */ } static void subflow_v6_req_destructor(struct request_sock *req) { subflow_req_destructor(req); tcp6_request_sock_ops.destructor(req); } #endif struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) { if (ops->family == AF_INET) ops = &mptcp_subflow_v4_request_sock_ops; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (ops->family == AF_INET6) ops = &mptcp_subflow_v6_request_sock_ops; #endif return inet_reqsk_alloc(ops, sk_listener, attach_listener); } EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc); /* validate hmac received in third ACK */ static bool subflow_hmac_valid(const struct request_sock *req, const struct mptcp_options_received *mp_opt) { const struct mptcp_subflow_request_sock *subflow_req; u8 hmac[SHA256_DIGEST_SIZE]; struct mptcp_sock *msk; subflow_req = mptcp_subflow_rsk(req); msk = subflow_req->msk; if (!msk) return false; subflow_generate_hmac(READ_ONCE(msk->remote_key), READ_ONCE(msk->local_key), subflow_req->remote_nonce, subflow_req->local_nonce, hmac); return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN); } static void subflow_ulp_fallback(struct sock *sk, struct mptcp_subflow_context *old_ctx) { struct inet_connection_sock *icsk = inet_csk(sk); mptcp_subflow_tcp_fallback(sk, old_ctx); icsk->icsk_ulp_ops = NULL; rcu_assign_pointer(icsk->icsk_ulp_data, NULL); tcp_sk(sk)->is_mptcp = 0; mptcp_subflow_ops_undo_override(sk); } void mptcp_subflow_drop_ctx(struct sock *ssk) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); if (!ctx) return; list_del(&mptcp_subflow_ctx(ssk)->node); if (inet_csk(ssk)->icsk_ulp_ops) { subflow_ulp_fallback(ssk, ctx); if (ctx->conn) sock_put(ctx->conn); } kfree_rcu(ctx, rcu); } void __mptcp_subflow_fully_established(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt) { subflow_set_remote_key(msk, subflow, mp_opt); WRITE_ONCE(subflow->fully_established, true); WRITE_ONCE(msk->fully_established, true); if (subflow->is_mptfo) __mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt); } static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); struct mptcp_subflow_request_sock *subflow_req; struct mptcp_options_received mp_opt; bool fallback, fallback_is_fatal; enum sk_rst_reason reason; struct mptcp_sock *owner; struct sock *child; pr_debug("listener=%p, req=%p, conn=%p\n", listener, req, listener->conn); /* After child creation we must look for MPC even when options * are not parsed */ mp_opt.suboptions = 0; /* hopefully temporary handling for MP_JOIN+syncookie */ subflow_req = mptcp_subflow_rsk(req); fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join; fallback = !tcp_rsk(req)->is_mptcp; if (fallback) goto create_child; /* if the sk is MP_CAPABLE, we try to fetch the client key */ if (subflow_req->mp_capable) { /* we can receive and accept an in-window, out-of-order pkt, * which may not carry the MP_CAPABLE opt even on mptcp enabled * paths: always try to extract the peer key, and fallback * for packets missing it. * Even OoO DSS packets coming legitly after dropped or * reordered MPC will cause fallback, but we don't have other * options. */ mptcp_get_options(skb, &mp_opt); if (!(mp_opt.suboptions & (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_ACK))) fallback = true; } else if (subflow_req->mp_join) { mptcp_get_options(skb, &mp_opt); if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK) || !subflow_hmac_valid(req, &mp_opt) || !mptcp_can_accept_new_subflow(subflow_req->msk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); fallback = true; } } create_child: child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash, own_req); if (child && *own_req) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); tcp_rsk(req)->drop_req = false; /* we need to fallback on ctx allocation failure and on pre-reqs * checking above. In the latter scenario we additionally need * to reset the context to non MPTCP status. */ if (!ctx || fallback) { if (fallback_is_fatal) { subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); goto dispose_child; } goto fallback; } /* ssk inherits options of listener sk */ ctx->setsockopt_seq = listener->setsockopt_seq; if (ctx->mp_capable) { ctx->conn = mptcp_sk_clone_init(listener->conn, &mp_opt, child, req); if (!ctx->conn) goto fallback; ctx->subflow_id = 1; owner = mptcp_sk(ctx->conn); mptcp_pm_new_connection(owner, child, 1); /* with OoO packets we can reach here without ingress * mpc option */ if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) { mptcp_pm_fully_established(owner, child); ctx->pm_notified = 1; } } else if (ctx->mp_join) { owner = subflow_req->msk; if (!owner) { subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } /* move the msk reference ownership to the subflow */ subflow_req->msk = NULL; ctx->conn = (struct sock *)owner; if (subflow_use_different_sport(owner, sk)) { pr_debug("ack inet_sport=%d %d\n", ntohs(inet_sk(sk)->inet_sport), ntohs(inet_sk((struct sock *)owner)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(owner, sk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX); } if (!mptcp_finish_join(child)) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(child); subflow_add_reset_reason(skb, subflow->reset_reason); goto dispose_child; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); tcp_rsk(req)->drop_req = true; } } /* check for expected invariant - should never trigger, just help * catching earlier subtle bugs */ WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp && (!mptcp_subflow_ctx(child) || !mptcp_subflow_ctx(child)->conn)); return child; dispose_child: mptcp_subflow_drop_ctx(child); tcp_rsk(req)->drop_req = true; inet_csk_prepare_for_destroy_sock(child); tcp_done(child); reason = mptcp_get_rst_reason(skb); req->rsk_ops->send_reset(sk, skb, reason); /* The last child reference will be released by the caller */ return child; fallback: if (fallback) SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); mptcp_subflow_drop_ctx(child); return child; } static struct inet_connection_sock_af_ops subflow_specific __ro_after_init; static struct proto tcp_prot_override __ro_after_init; enum mapping_status { MAPPING_OK, MAPPING_INVALID, MAPPING_EMPTY, MAPPING_DATA_FIN, MAPPING_DUMMY, MAPPING_BAD_CSUM, MAPPING_NODSS }; static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) { pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d\n", ssn, subflow->map_subflow_seq, subflow->map_data_len); } static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); unsigned int skb_consumed; skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq; if (unlikely(skb_consumed >= skb->len)) { DEBUG_NET_WARN_ON_ONCE(1); return true; } return skb->len - skb_consumed <= subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); } static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; if (unlikely(before(ssn, subflow->map_subflow_seq))) { /* Mapping covers data later in the subflow stream, * currently unsupported. */ dbg_bad_map(subflow, ssn); return false; } if (unlikely(!before(ssn, subflow->map_subflow_seq + subflow->map_data_len))) { /* Mapping does covers past subflow data, invalid */ dbg_bad_map(subflow, ssn); return false; } return true; } static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb, bool csum_reqd) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); u32 offset, seq, delta; __sum16 csum; int len; if (!csum_reqd) return MAPPING_OK; /* mapping already validated on previous traversal */ if (subflow->map_csum_len == subflow->map_data_len) return MAPPING_OK; /* traverse the receive queue, ensuring it contains a full * DSS mapping and accumulating the related csum. * Preserve the accoumlate csum across multiple calls, to compute * the csum only once */ delta = subflow->map_data_len - subflow->map_csum_len; for (;;) { seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len; offset = seq - TCP_SKB_CB(skb)->seq; /* if the current skb has not been accounted yet, csum its contents * up to the amount covered by the current DSS */ if (offset < skb->len) { __wsum csum; len = min(skb->len - offset, delta); csum = skb_checksum(skb, offset, len, 0); subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum, subflow->map_csum_len); delta -= len; subflow->map_csum_len += len; } if (delta == 0) break; if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) { /* if this subflow is closed, the partial mapping * will be never completed; flush the pending skbs, so * that subflow_sched_work_if_closed() can kick in */ if (unlikely(ssk->sk_state == TCP_CLOSE)) while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); /* not enough data to validate the csum */ return MAPPING_EMPTY; } /* the DSS mapping for next skbs will be validated later, * when a get_mapping_status call will process such skb */ skb = skb->next; } /* note that 'map_data_len' accounts only for the carried data, does * not include the eventual seq increment due to the data fin, * while the pseudo header requires the original DSS data len, * including that */ csum = __mptcp_make_csum(subflow->map_seq, subflow->map_subflow_seq, subflow->map_data_len + subflow->map_data_fin, subflow->map_data_csum); if (unlikely(csum)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); return MAPPING_BAD_CSUM; } subflow->valid_csum_seen = 1; return MAPPING_OK; } static enum mapping_status get_mapping_status(struct sock *ssk, struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool csum_reqd = READ_ONCE(msk->csum_enabled); struct mptcp_ext *mpext; struct sk_buff *skb; u16 data_len; u64 map_seq; skb = skb_peek(&ssk->sk_receive_queue); if (!skb) return MAPPING_EMPTY; if (mptcp_check_fallback(ssk)) return MAPPING_DUMMY; mpext = mptcp_get_ext(skb); if (!mpext || !mpext->use_map) { if (!subflow->map_valid && !skb->len) { /* the TCP stack deliver 0 len FIN pkt to the receive * queue, that is the only 0len pkts ever expected here, * and we can admit no mapping only for 0 len pkts */ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) WARN_ONCE(1, "0len seq %d:%d flags %x", TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, TCP_SKB_CB(skb)->tcp_flags); sk_eat_skb(ssk, skb); return MAPPING_EMPTY; } /* If the required DSS has likely been dropped by a middlebox */ if (!subflow->map_valid) return MAPPING_NODSS; goto validate_seq; } trace_get_mapping_status(mpext); data_len = mpext->data_len; if (data_len == 0) { pr_debug("infinite mapping received\n"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); subflow->map_data_len = 0; return MAPPING_INVALID; } if (mpext->data_fin == 1) { u64 data_fin_seq; if (data_len == 1) { bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq, mpext->dsn64); pr_debug("DATA_FIN with no payload seq=%llu\n", mpext->data_seq); if (subflow->map_valid) { /* A DATA_FIN might arrive in a DSS * option before the previous mapping * has been fully consumed. Continue * handling the existing mapping. */ skb_ext_del(skb, SKB_EXT_MPTCP); return MAPPING_OK; } if (updated) mptcp_schedule_work((struct sock *)msk); return MAPPING_DATA_FIN; } data_fin_seq = mpext->data_seq + data_len - 1; /* If mpext->data_seq is a 32-bit value, data_fin_seq must also * be limited to 32 bits. */ if (!mpext->dsn64) data_fin_seq &= GENMASK_ULL(31, 0); mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64); pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d\n", data_fin_seq, mpext->dsn64); /* Adjust for DATA_FIN using 1 byte of sequence space */ data_len--; } map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64); WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64); if (subflow->map_valid) { /* Allow replacing only with an identical map */ if (subflow->map_seq == map_seq && subflow->map_subflow_seq == mpext->subflow_seq && subflow->map_data_len == data_len && subflow->map_csum_reqd == mpext->csum_reqd) { skb_ext_del(skb, SKB_EXT_MPTCP); goto validate_csum; } /* If this skb data are fully covered by the current mapping, * the new map would need caching, which is not supported */ if (skb_is_fully_mapped(ssk, skb)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH); return MAPPING_INVALID; } /* will validate the next map after consuming the current one */ goto validate_csum; } subflow->map_seq = map_seq; subflow->map_subflow_seq = mpext->subflow_seq; subflow->map_data_len = data_len; subflow->map_valid = 1; subflow->map_data_fin = mpext->data_fin; subflow->mpc_map = mpext->mpc_map; subflow->map_csum_reqd = mpext->csum_reqd; subflow->map_csum_len = 0; subflow->map_data_csum = csum_unfold(mpext->csum); /* Cfr RFC 8684 Section 3.3.0 */ if (unlikely(subflow->map_csum_reqd != csum_reqd)) return MAPPING_INVALID; pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n", subflow->map_seq, subflow->map_subflow_seq, subflow->map_data_len, subflow->map_csum_reqd, subflow->map_data_csum); validate_seq: /* we revalidate valid mapping on new skb, because we must ensure * the current skb is completely covered by the available mapping */ if (!validate_mapping(ssk, skb)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH); return MAPPING_INVALID; } skb_ext_del(skb, SKB_EXT_MPTCP); validate_csum: return validate_data_csum(ssk, skb, csum_reqd); } static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, u64 limit) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; struct tcp_sock *tp = tcp_sk(ssk); u32 offset, incr, avail_len; offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; if (WARN_ON_ONCE(offset > skb->len)) goto out; avail_len = skb->len - offset; incr = limit >= avail_len ? avail_len + fin : limit; pr_debug("discarding=%d len=%d offset=%d seq=%d\n", incr, skb->len, offset, subflow->map_subflow_seq); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA); tcp_sk(ssk)->copied_seq += incr; out: if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq)) sk_eat_skb(ssk, skb); if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) subflow->map_valid = 0; } /* sched mptcp worker to remove the subflow if no more data is pending */ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; if (likely(ssk->sk_state != TCP_CLOSE && (ssk->sk_state != TCP_CLOSE_WAIT || inet_sk_state_load(sk) != TCP_ESTABLISHED))) return; if (skb_queue_empty(&ssk->sk_receive_queue) && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) mptcp_schedule_work(sk); } static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) { struct mptcp_sock *msk = mptcp_sk(subflow->conn); if (subflow->mp_join) return false; else if (READ_ONCE(msk->csum_enabled)) return !subflow->valid_csum_seen; else return READ_ONCE(msk->allow_infinite_fallback); } static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); unsigned long fail_tout; /* graceful failure can happen only on the MPC subflow */ if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first))) return; /* since the close timeout take precedence on the fail one, * no need to start the latter when the first is already set */ if (sock_flag((struct sock *)msk, SOCK_DEAD)) return; /* we don't need extreme accuracy here, use a zero fail_tout as special * value meaning no fail timeout at all; */ fail_tout = jiffies + TCP_RTO_MAX; if (!fail_tout) fail_tout = 1; WRITE_ONCE(subflow->fail_tout, fail_tout); tcp_send_ack(ssk); mptcp_reset_tout_timer(msk, subflow->fail_tout); } static bool subflow_check_data_avail(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); enum mapping_status status; struct mptcp_sock *msk; struct sk_buff *skb; if (!skb_peek(&ssk->sk_receive_queue)) WRITE_ONCE(subflow->data_avail, false); if (subflow->data_avail) return true; msk = mptcp_sk(subflow->conn); for (;;) { u64 ack_seq; u64 old_ack; status = get_mapping_status(ssk, msk); trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY || status == MAPPING_BAD_CSUM || status == MAPPING_NODSS)) goto fallback; if (status != MAPPING_OK) goto no_data; skb = skb_peek(&ssk->sk_receive_queue); if (WARN_ON_ONCE(!skb)) goto no_data; if (unlikely(!READ_ONCE(msk->can_ack))) goto fallback; old_ack = READ_ONCE(msk->ack_seq); ack_seq = mptcp_subflow_get_mapped_dsn(subflow); pr_debug("msk ack_seq=%llx subflow ack_seq=%llx\n", old_ack, ack_seq); if (unlikely(before64(ack_seq, old_ack))) { mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq); continue; } WRITE_ONCE(subflow->data_avail, true); break; } return true; no_data: subflow_sched_work_if_closed(msk, ssk); return false; fallback: if (!__mptcp_check_fallback(msk)) { /* RFC 8684 section 3.7. */ if (status == MAPPING_BAD_CSUM && (subflow->mp_join || subflow->valid_csum_seen)) { subflow->send_mp_fail = 1; if (!READ_ONCE(msk->allow_infinite_fallback)) { subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; goto reset; } mptcp_subflow_fail(msk, ssk); WRITE_ONCE(subflow->data_avail, true); return true; } if (!subflow_can_fallback(subflow) && subflow->map_data_len) { /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers */ subflow->reset_transient = 0; subflow->reset_reason = status == MAPPING_NODSS ? MPTCP_RST_EMIDDLEBOX : MPTCP_RST_EMPTCP; reset: WRITE_ONCE(ssk->sk_err, EBADMSG); tcp_set_state(ssk, TCP_CLOSE); while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); mptcp_send_active_reset_reason(ssk); WRITE_ONCE(subflow->data_avail, false); return false; } mptcp_do_fallback(ssk); } skb = skb_peek(&ssk->sk_receive_queue); subflow->map_valid = 1; subflow->map_seq = READ_ONCE(msk->ack_seq); subflow->map_data_len = skb->len; subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; WRITE_ONCE(subflow->data_avail, true); return true; } bool mptcp_subflow_data_available(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* check if current mapping is still valid */ if (subflow->map_valid && mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { subflow->map_valid = 0; WRITE_ONCE(subflow->data_avail, false); pr_debug("Done with mapping: seq=%u data_len=%u\n", subflow->map_subflow_seq, subflow->map_data_len); } return subflow_check_data_avail(sk); } /* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy, * not the ssk one. * * In mptcp, rwin is about the mptcp-level connection data. * * Data that is still on the ssk rx queue can thus be ignored, * as far as mptcp peer is concerned that data is still inflight. * DSS ACK is updated when skb is moved to the mptcp rx queue. */ void mptcp_space(const struct sock *ssk, int *space, int *full_space) { const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); const struct sock *sk = subflow->conn; *space = __mptcp_space(sk); *full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } static void subflow_error_report(struct sock *ssk) { struct sock *sk = mptcp_subflow_ctx(ssk)->conn; /* bail early if this is a no-op, so that we avoid introducing a * problematic lockdep dependency between TCP accept queue lock * and msk socket spinlock */ if (!sk->sk_socket) return; mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_error_report(sk); else __set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } static void subflow_data_ready(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); u16 state = 1 << inet_sk_state_load(sk); struct sock *parent = subflow->conn; struct mptcp_sock *msk; trace_sk_data_ready(sk); msk = mptcp_sk(parent); if (state & TCPF_LISTEN) { /* MPJ subflow are removed from accept queue before reaching here, * avoid stray wakeups */ if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue)) return; parent->sk_data_ready(parent); return; } WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && !subflow->mp_join && !(state & TCPF_CLOSE)); if (mptcp_subflow_data_available(sk)) { mptcp_data_ready(parent, sk); /* subflow-level lowat test are not relevant. * respect the msk-level threshold eventually mandating an immediate ack */ if (mptcp_data_avail(msk) < parent->sk_rcvlowat && (tcp_sk(sk)->rcv_nxt - tcp_sk(sk)->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss) inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } else if (unlikely(sk->sk_err)) { subflow_error_report(sk); } } static void subflow_write_space(struct sock *ssk) { struct sock *sk = mptcp_subflow_ctx(ssk)->conn; mptcp_propagate_sndbuf(sk, ssk); mptcp_write_space(sk); } static const struct inet_connection_sock_af_ops * subflow_default_af_ops(struct sock *sk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (sk->sk_family == AF_INET6) return &subflow_v6_specific; #endif return &subflow_specific; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_connection_sock_af_ops *target; target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d\n", subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); if (likely(icsk->icsk_af_ops == target)) return; subflow->icsk_af_ops = icsk->icsk_af_ops; icsk->icsk_af_ops = target; } #endif void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family) { memset(addr, 0, sizeof(*addr)); addr->ss_family = family; if (addr->ss_family == AF_INET) { struct sockaddr_in *in_addr = (struct sockaddr_in *)addr; if (info->family == AF_INET) in_addr->sin_addr = info->addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (ipv6_addr_v4mapped(&info->addr6)) in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3]; #endif in_addr->sin_port = info->port; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->ss_family == AF_INET6) { struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr; if (info->family == AF_INET) ipv6_addr_set_v4mapped(info->addr.s_addr, &in6_addr->sin6_addr); else in6_addr->sin6_addr = info->addr6; in6_addr->sin6_port = info->port; } #endif } int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, const struct mptcp_addr_info *remote) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; int local_id = local->addr.id; struct sockaddr_storage addr; int remote_id = remote->id; int err = -ENOTCONN; struct socket *sf; struct sock *ssk; u32 remote_token; int addrlen; /* The userspace PM sent the request too early? */ if (!mptcp_is_fully_established(sk)) goto err_out; err = mptcp_subflow_create_socket(sk, local->addr.family, &sf); if (err) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCREATSKERR); pr_debug("msk=%p local=%d remote=%d create sock error: %d\n", msk, local_id, remote_id, err); goto err_out; } ssk = sf->sk; subflow = mptcp_subflow_ctx(ssk); do { get_random_bytes(&subflow->local_nonce, sizeof(u32)); } while (!subflow->local_nonce); /* if 'IPADDRANY', the ID will be set later, after the routing */ if (local->addr.family == AF_INET) { if (!local->addr.addr.s_addr) local_id = -1; #if IS_ENABLED(CONFIG_MPTCP_IPV6) } else if (sk->sk_family == AF_INET6) { if (ipv6_addr_any(&local->addr.addr6)) local_id = -1; #endif } if (local_id >= 0) subflow_set_local_id(subflow, local_id); subflow->remote_key_valid = 1; subflow->remote_key = READ_ONCE(msk->remote_key); subflow->local_key = READ_ONCE(msk->local_key); subflow->token = msk->token; mptcp_info2sockaddr(&local->addr, &addr, ssk->sk_family); addrlen = sizeof(struct sockaddr_in); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (addr.ss_family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif ssk->sk_bound_dev_if = local->ifindex; err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); if (err) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXBINDERR); pr_debug("msk=%p local=%d remote=%d bind error: %d\n", msk, local_id, remote_id, err); goto failed; } mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL); pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d\n", msk, remote_token, local_id, remote_id); subflow->remote_token = remote_token; WRITE_ONCE(subflow->remote_id, remote_id); subflow->request_join = 1; subflow->request_bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP); subflow->subflow_id = msk->subflow_id++; mptcp_info2sockaddr(remote, &addr, ssk->sk_family); sock_hold(ssk); list_add_tail(&subflow->node, &msk->conn_list); err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCONNECTERR); pr_debug("msk=%p local=%d remote=%d connect error: %d\n", msk, local_id, remote_id, err); goto failed_unlink; } MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTX); /* discard the subflow socket */ mptcp_sock_graft(ssk, sk->sk_socket); iput(SOCK_INODE(sf)); WRITE_ONCE(msk->allow_infinite_fallback, false); mptcp_stop_tout_timer(sk); return 0; failed_unlink: list_del(&subflow->node); sock_put(mptcp_subflow_tcp_sock(subflow)); failed: subflow->disposable = 1; sock_release(sf); err_out: /* we account subflows before the creation, and this failures will not * be caught by sk_state_change() */ mptcp_pm_close_subflow(msk); return err; } static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) { #ifdef CONFIG_SOCK_CGROUP_DATA struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data, *child_skcd = &child->sk_cgrp_data; /* only the additional subflows created by kworkers have to be modified */ if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != cgroup_id(sock_cgroup_ptr(child_skcd))) { #ifdef CONFIG_MEMCG struct mem_cgroup *memcg = parent->sk_memcg; mem_cgroup_sk_free(child); if (memcg && css_tryget(&memcg->css)) child->sk_memcg = memcg; #endif /* CONFIG_MEMCG */ cgroup_sk_free(child_skcd); *child_skcd = *parent_skcd; cgroup_sk_clone(child_skcd); } #endif /* CONFIG_SOCK_CGROUP_DATA */ } static void mptcp_subflow_ops_override(struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (ssk->sk_prot == &tcpv6_prot) ssk->sk_prot = &tcpv6_prot_override; else #endif ssk->sk_prot = &tcp_prot_override; } static void mptcp_subflow_ops_undo_override(struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (ssk->sk_prot == &tcpv6_prot_override) ssk->sk_prot = &tcpv6_prot; else #endif ssk->sk_prot = &tcp_prot; } int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, struct socket **new_sock) { struct mptcp_subflow_context *subflow; struct net *net = sock_net(sk); struct socket *sf; int err; /* un-accepted server sockets can reach here - on bad configuration * bail early to avoid greater trouble later */ if (unlikely(!sk->sk_socket)) return -EINVAL; err = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &sf); if (err) return err; lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING); err = security_mptcp_add_subflow(sk, sf->sk); if (err) goto err_free; /* the newly created socket has to be in the same cgroup as its parent */ mptcp_attach_cgroup(sk, sf->sk); /* kernel sockets do not by default acquire net ref, but TCP timer * needs it. * Update ns_tracker to current stack trace and refcounted tracker. */ __netns_tracker_free(net, &sf->sk->ns_tracker, false); sf->sk->sk_net_refcnt = 1; get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL); sock_inuse_add(net, 1); err = tcp_set_ulp(sf->sk, "mptcp"); if (err) goto err_free; mptcp_sockopt_sync_locked(mptcp_sk(sk), sf->sk); release_sock(sf->sk); /* the newly created socket really belongs to the owning MPTCP * socket, even if for additional subflows the allocation is performed * by a kernel workqueue. Adjust inode references, so that the * procfs/diag interfaces really show this one belonging to the correct * user. */ SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino; SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid; SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid; subflow = mptcp_subflow_ctx(sf->sk); pr_debug("subflow=%p\n", subflow); *new_sock = sf; sock_hold(sk); subflow->conn = sk; mptcp_subflow_ops_override(sf->sk); return 0; err_free: release_sock(sf->sk); sock_release(sf); return err; } static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, gfp_t priority) { struct inet_connection_sock *icsk = inet_csk(sk); struct mptcp_subflow_context *ctx; ctx = kzalloc(sizeof(*ctx), priority); if (!ctx) return NULL; rcu_assign_pointer(icsk->icsk_ulp_data, ctx); INIT_LIST_HEAD(&ctx->node); INIT_LIST_HEAD(&ctx->delegated_node); pr_debug("subflow=%p\n", ctx); ctx->tcp_sock = sk; WRITE_ONCE(ctx->local_id, -1); return ctx; } static void __subflow_state_change(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } static bool subflow_is_done(const struct sock *sk) { return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; } static void subflow_state_change(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; struct mptcp_sock *msk; __subflow_state_change(sk); msk = mptcp_sk(parent); if (subflow_simultaneous_connect(sk)) { mptcp_do_fallback(sk); pr_fallback(msk); subflow->conn_finished = 1; mptcp_propagate_state(parent, sk, subflow, NULL); } /* as recvmsg() does not acquire the subflow socket for ssk selection * a fin packet carrying a DSS can be unnoticed if we don't trigger * the data available machinery here. */ if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); else if (unlikely(sk->sk_err)) subflow_error_report(sk); subflow_sched_work_if_closed(mptcp_sk(parent), sk); /* when the fallback subflow closes the rx side, trigger a 'dummy' * ingress data fin, so that the msk state will follow along */ if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk && mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) mptcp_schedule_work(parent); } void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) { struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; struct request_sock *req, *head, *tail; struct mptcp_subflow_context *subflow; struct sock *sk, *ssk; /* Due to lock dependencies no relevant lock can be acquired under rskq_lock. * Splice the req list, so that accept() can not reach the pending ssk after * the listener socket is released below. */ spin_lock_bh(&queue->rskq_lock); head = queue->rskq_accept_head; tail = queue->rskq_accept_tail; queue->rskq_accept_head = NULL; queue->rskq_accept_tail = NULL; spin_unlock_bh(&queue->rskq_lock); if (!head) return; /* can't acquire the msk socket lock under the subflow one, * or will cause ABBA deadlock */ release_sock(listener_ssk); for (req = head; req; req = req->dl_next) { ssk = req->sk; if (!sk_is_mptcp(ssk)) continue; subflow = mptcp_subflow_ctx(ssk); if (!subflow || !subflow->conn) continue; sk = subflow->conn; sock_hold(sk); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); __mptcp_unaccepted_force_close(sk); release_sock(sk); /* lockdep will report a false positive ABBA deadlock * between cancel_work_sync and the listener socket. * The involved locks belong to different sockets WRT * the existing AB chain. * Using a per socket key is problematic as key * deregistration requires process context and must be * performed at socket disposal time, in atomic * context. * Just tell lockdep to consider the listener socket * released here. */ mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_); mptcp_cancel_work(sk); mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_); sock_put(sk); } /* we are still under the listener msk socket lock */ lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); /* restore the listener queue, to let the TCP code clean it up */ spin_lock_bh(&queue->rskq_lock); WARN_ON_ONCE(queue->rskq_accept_head); queue->rskq_accept_head = head; queue->rskq_accept_tail = tail; spin_unlock_bh(&queue->rskq_lock); } static int subflow_ulp_init(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct mptcp_subflow_context *ctx; struct tcp_sock *tp = tcp_sk(sk); int err = 0; /* disallow attaching ULP to a socket unless it has been * created with sock_create_kern() */ if (!sk->sk_kern_sock) { err = -EOPNOTSUPP; goto out; } ctx = subflow_create_ctx(sk, GFP_KERNEL); if (!ctx) { err = -ENOMEM; goto out; } pr_debug("subflow=%p, family=%d\n", ctx, sk->sk_family); tp->is_mptcp = 1; ctx->icsk_af_ops = icsk->icsk_af_ops; icsk->icsk_af_ops = subflow_default_af_ops(sk); ctx->tcp_state_change = sk->sk_state_change; ctx->tcp_error_report = sk->sk_error_report; WARN_ON_ONCE(sk->sk_data_ready != sock_def_readable); WARN_ON_ONCE(sk->sk_write_space != sk_stream_write_space); sk->sk_data_ready = subflow_data_ready; sk->sk_write_space = subflow_write_space; sk->sk_state_change = subflow_state_change; sk->sk_error_report = subflow_error_report; out: return err; } static void subflow_ulp_release(struct sock *ssk) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); bool release = true; struct sock *sk; if (!ctx) return; sk = ctx->conn; if (sk) { /* if the msk has been orphaned, keep the ctx * alive, will be freed by __mptcp_close_ssk(), * when the subflow is still unaccepted */ release = ctx->disposable || list_empty(&ctx->node); /* inet_child_forget() does not call sk_state_change(), * explicitly trigger the socket close machinery */ if (!release && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) mptcp_schedule_work(sk); sock_put(sk); } mptcp_subflow_ops_undo_override(ssk); if (release) kfree_rcu(ctx, rcu); } static void subflow_ulp_clone(const struct request_sock *req, struct sock *newsk, const gfp_t priority) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); struct mptcp_subflow_context *new_ctx; if (!tcp_rsk(req)->is_mptcp || (!subflow_req->mp_capable && !subflow_req->mp_join)) { subflow_ulp_fallback(newsk, old_ctx); return; } new_ctx = subflow_create_ctx(newsk, priority); if (!new_ctx) { subflow_ulp_fallback(newsk, old_ctx); return; } new_ctx->conn_finished = 1; new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; new_ctx->tcp_state_change = old_ctx->tcp_state_change; new_ctx->tcp_error_report = old_ctx->tcp_error_report; new_ctx->rel_write_seq = 1; if (subflow_req->mp_capable) { /* see comments in subflow_syn_recv_sock(), MPTCP connection * is fully established only after we receive the remote key */ new_ctx->mp_capable = 1; new_ctx->local_key = subflow_req->local_key; new_ctx->token = subflow_req->token; new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->idsn = subflow_req->idsn; /* this is the first subflow, id is always 0 */ subflow_set_local_id(new_ctx, 0); } else if (subflow_req->mp_join) { new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->mp_join = 1; WRITE_ONCE(new_ctx->fully_established, true); new_ctx->remote_key_valid = 1; new_ctx->backup = subflow_req->backup; new_ctx->request_bkup = subflow_req->request_bkup; WRITE_ONCE(new_ctx->remote_id, subflow_req->remote_id); new_ctx->token = subflow_req->token; new_ctx->thmac = subflow_req->thmac; /* the subflow req id is valid, fetched via subflow_check_req() * and subflow_token_join_request() */ subflow_set_local_id(new_ctx, subflow_req->local_id); } } static void tcp_release_cb_override(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); long status; /* process and clear all the pending actions, but leave the subflow into * the napi queue. To respect locking, only the same CPU that originated * the action can touch the list. mptcp_napi_poll will take care of it. */ status = set_mask_bits(&subflow->delegated_status, MPTCP_DELEGATE_ACTIONS_MASK, 0); if (status) mptcp_subflow_process_delegated(ssk, status); tcp_release_cb(ssk); } static int tcp_abort_override(struct sock *ssk, int err) { /* closing a listener subflow requires a great deal of care. * keep it simple and just prevent such operation */ if (inet_sk_state_load(ssk) == TCP_LISTEN) return -EINVAL; return tcp_abort(ssk, err); } static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { .name = "mptcp", .owner = THIS_MODULE, .init = subflow_ulp_init, .release = subflow_ulp_release, .clone = subflow_ulp_clone, }; static int subflow_ops_init(struct request_sock_ops *subflow_ops) { subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock); subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name, subflow_ops->obj_size, 0, SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, NULL); if (!subflow_ops->slab) return -ENOMEM; return 0; } void __init mptcp_subflow_init(void) { mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops; mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4"; mptcp_subflow_v4_request_sock_ops.destructor = subflow_v4_req_destructor; if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0) panic("MPTCP: failed to init subflow v4 request sock ops\n"); subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req; subflow_request_sock_ipv4_ops.send_synack = subflow_v4_send_synack; subflow_specific = ipv4_specific; subflow_specific.conn_request = subflow_v4_conn_request; subflow_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_specific.sk_rx_dst_set = subflow_finish_connect; subflow_specific.rebuild_header = subflow_rebuild_header; tcp_prot_override = tcp_prot; tcp_prot_override.release_cb = tcp_release_cb_override; tcp_prot_override.diag_destroy = tcp_abort_override; #if IS_ENABLED(CONFIG_MPTCP_IPV6) /* In struct mptcp_subflow_request_sock, we assume the TCP request sock * structures for v4 and v6 have the same size. It should not changed in * the future but better to make sure to be warned if it is no longer * the case. */ BUILD_BUG_ON(sizeof(struct tcp_request_sock) != sizeof(struct tcp6_request_sock)); mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops; mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6"; mptcp_subflow_v6_request_sock_ops.destructor = subflow_v6_req_destructor; if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0) panic("MPTCP: failed to init subflow v6 request sock ops\n"); subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; subflow_request_sock_ipv6_ops.send_synack = subflow_v6_send_synack; subflow_v6_specific = ipv6_specific; subflow_v6_specific.conn_request = subflow_v6_conn_request; subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; subflow_v6_specific.rebuild_header = subflow_v6_rebuild_header; subflow_v6m_specific = subflow_v6_specific; subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; subflow_v6m_specific.send_check = ipv4_specific.send_check; subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; subflow_v6m_specific.rebuild_header = subflow_rebuild_header; tcpv6_prot_override = tcpv6_prot; tcpv6_prot_override.release_cb = tcp_release_cb_override; tcpv6_prot_override.diag_destroy = tcp_abort_override; #endif mptcp_diag_subflow_init(&subflow_ulp_ops); if (tcp_register_ulp(&subflow_ulp_ops) != 0) panic("MPTCP: failed to register subflows to ULP\n"); }
504 504 506 506 506 506 130 504 504 504 217 83 128 128 128 109 16 47 130 15 128 130 130 130 83 83 83 83 504 485 504 506 130 506 506 505 2 506 506 505 506 506 506 68 492 506 2 505 506 506 506 506 505 506 506 142 503 504 504 504 506 142 474 83 130 506 504 142 506 504 142 506 130 506 129 3 506 506 506 506 506 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 // SPDX-License-Identifier: GPL-2.0 /* * Generic ring buffer * * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> */ #include <linux/trace_recursion.h> #include <linux/trace_events.h> #include <linux/ring_buffer.h> #include <linux/trace_clock.h> #include <linux/sched/clock.h> #include <linux/cacheflush.h> #include <linux/trace_seq.h> #include <linux/spinlock.h> #include <linux/irq_work.h> #include <linux/security.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kthread.h> /* for self test */ #include <linux/module.h> #include <linux/percpu.h> #include <linux/mutex.h> #include <linux/delay.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/list.h> #include <linux/cpu.h> #include <linux/oom.h> #include <linux/mm.h> #include <asm/local64.h> #include <asm/local.h> #include "trace.h" /* * The "absolute" timestamp in the buffer is only 59 bits. * If a clock has the 5 MSBs set, it needs to be saved and * reinserted. */ #define TS_MSB (0xf8ULL << 56) #define ABS_TS_MASK (~TS_MSB) static void update_pages_handler(struct work_struct *work); #define RING_BUFFER_META_MAGIC 0xBADFEED struct ring_buffer_meta { int magic; int struct_size; unsigned long text_addr; unsigned long data_addr; unsigned long first_buffer; unsigned long head_buffer; unsigned long commit_buffer; __u32 subbuf_size; __u32 nr_subbufs; int buffers[]; }; /* * The ring buffer header is special. We must manually up keep it. */ int ring_buffer_print_entry_header(struct trace_seq *s) { trace_seq_puts(s, "# compressed entry header\n"); trace_seq_puts(s, "\ttype_len : 5 bits\n"); trace_seq_puts(s, "\ttime_delta : 27 bits\n"); trace_seq_puts(s, "\tarray : 32 bits\n"); trace_seq_putc(s, '\n'); trace_seq_printf(s, "\tpadding : type == %d\n", RINGBUF_TYPE_PADDING); trace_seq_printf(s, "\ttime_extend : type == %d\n", RINGBUF_TYPE_TIME_EXTEND); trace_seq_printf(s, "\ttime_stamp : type == %d\n", RINGBUF_TYPE_TIME_STAMP); trace_seq_printf(s, "\tdata max type_len == %d\n", RINGBUF_TYPE_DATA_TYPE_LEN_MAX); return !trace_seq_has_overflowed(s); } /* * The ring buffer is made up of a list of pages. A separate list of pages is * allocated for each CPU. A writer may only write to a buffer that is * associated with the CPU it is currently executing on. A reader may read * from any per cpu buffer. * * The reader is special. For each per cpu buffer, the reader has its own * reader page. When a reader has read the entire reader page, this reader * page is swapped with another page in the ring buffer. * * Now, as long as the writer is off the reader page, the reader can do what * ever it wants with that page. The writer will never write to that page * again (as long as it is out of the ring buffer). * * Here's some silly ASCII art. * * +------+ * |reader| RING BUFFER * |page | * +------+ +---+ +---+ +---+ * | |-->| |-->| | * +---+ +---+ +---+ * ^ | * | | * +---------------+ * * * +------+ * |reader| RING BUFFER * |page |------------------v * +------+ +---+ +---+ +---+ * | |-->| |-->| | * +---+ +---+ +---+ * ^ | * | | * +---------------+ * * * +------+ * |reader| RING BUFFER * |page |------------------v * +------+ +---+ +---+ +---+ * ^ | |-->| |-->| | * | +---+ +---+ +---+ * | | * | | * +------------------------------+ * * * +------+ * |buffer| RING BUFFER * |page |------------------v * +------+ +---+ +---+ +---+ * ^ | | | |-->| | * | New +---+ +---+ +---+ * | Reader------^ | * | page | * +------------------------------+ * * * After we make this swap, the reader can hand this page off to the splice * code and be done with it. It can even allocate a new page if it needs to * and swap that into the ring buffer. * * We will be using cmpxchg soon to make all this lockless. * */ /* Used for individual buffers (after the counter) */ #define RB_BUFFER_OFF (1 << 20) #define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ #ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS # define RB_FORCE_8BYTE_ALIGNMENT 0 # define RB_ARCH_ALIGNMENT RB_ALIGNMENT #else # define RB_FORCE_8BYTE_ALIGNMENT 1 # define RB_ARCH_ALIGNMENT 8U #endif #define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX enum { RB_LEN_TIME_EXTEND = 8, RB_LEN_TIME_STAMP = 8, }; #define skip_time_extend(event) \ ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) #define extended_time(event) \ (event->type_len >= RINGBUF_TYPE_TIME_EXTEND) static inline bool rb_null_event(struct ring_buffer_event *event) { return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; } static void rb_event_set_padding(struct ring_buffer_event *event) { /* padding has a NULL time_delta */ event->type_len = RINGBUF_TYPE_PADDING; event->time_delta = 0; } static unsigned rb_event_data_length(struct ring_buffer_event *event) { unsigned length; if (event->type_len) length = event->type_len * RB_ALIGNMENT; else length = event->array[0]; return length + RB_EVNT_HDR_SIZE; } /* * Return the length of the given event. Will return * the length of the time extend if the event is a * time extend. */ static inline unsigned rb_event_length(struct ring_buffer_event *event) { switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) /* undefined */ return -1; return event->array[0] + RB_EVNT_HDR_SIZE; case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; case RINGBUF_TYPE_TIME_STAMP: return RB_LEN_TIME_STAMP; case RINGBUF_TYPE_DATA: return rb_event_data_length(event); default: WARN_ON_ONCE(1); } /* not hit */ return 0; } /* * Return total length of time extend and data, * or just the event length for all other events. */ static inline unsigned rb_event_ts_length(struct ring_buffer_event *event) { unsigned len = 0; if (extended_time(event)) { /* time extends include the data event after it */ len = RB_LEN_TIME_EXTEND; event = skip_time_extend(event); } return len + rb_event_length(event); } /** * ring_buffer_event_length - return the length of the event * @event: the event to get the length of * * Returns the size of the data load of a data event. * If the event is something other than a data event, it * returns the size of the event itself. With the exception * of a TIME EXTEND, where it still returns the size of the * data load of the data event after it. */ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { unsigned length; if (extended_time(event)) event = skip_time_extend(event); length = rb_event_length(event); if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) length -= sizeof(event->array[0]); return length; } EXPORT_SYMBOL_GPL(ring_buffer_event_length); /* inline for ring buffer fast paths */ static __always_inline void * rb_event_data(struct ring_buffer_event *event) { if (extended_time(event)) event = skip_time_extend(event); WARN_ON_ONCE(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ if (event->type_len) return (void *)&event->array[0]; /* Otherwise length is in array[0] and array[1] has the data */ return (void *)&event->array[1]; } /** * ring_buffer_event_data - return the data of the event * @event: the event to get the data from */ void *ring_buffer_event_data(struct ring_buffer_event *event) { return rb_event_data(event); } EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_buffer_cpu(buffer, cpu) \ for_each_cpu(cpu, buffer->cpumask) #define for_each_online_buffer_cpu(buffer, cpu) \ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask) #define TS_SHIFT 27 #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) static u64 rb_event_time_stamp(struct ring_buffer_event *event) { u64 ts; ts = event->array[0]; ts <<= TS_SHIFT; ts += event->time_delta; return ts; } /* Flag when events were overwritten */ #define RB_MISSED_EVENTS (1 << 31) /* Missed count stored at end */ #define RB_MISSED_STORED (1 << 30) #define RB_MISSED_MASK (3 << 30) struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ }; struct buffer_data_read_page { unsigned order; /* order of the page */ struct buffer_data_page *data; /* actual data, stored in this page */ }; /* * Note, the buffer_page list must be first. The buffer pages * are allocated in cache lines, which means that each buffer * page will be at the beginning of a cache line, and thus * the least significant bits will be zero. We use this to * add flags in the list struct pointers, to make the ring buffer * lockless. */ struct buffer_page { struct list_head list; /* list of buffer pages */ local_t write; /* index for next write */ unsigned read; /* index for next read */ local_t entries; /* entries on this page */ unsigned long real_end; /* real end of data */ unsigned order; /* order of the page */ u32 id:30; /* ID for external mapping */ u32 range:1; /* Mapped via a range */ struct buffer_data_page *page; /* Actual data page */ }; /* * The buffer page counters, write and entries, must be reset * atomically when crossing page boundaries. To synchronize this * update, two counters are inserted into the number. One is * the actual counter for the write position or count on the page. * * The other is a counter of updaters. Before an update happens * the update partition of the counter is incremented. This will * allow the updater to update the counter atomically. * * The counter is 20 bits, and the state data is 12. */ #define RB_WRITE_MASK 0xfffff #define RB_WRITE_INTCNT (1 << 20) static void rb_init_page(struct buffer_data_page *bpage) { local_set(&bpage->commit, 0); } static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage) { return local_read(&bpage->page->commit); } static void free_buffer_page(struct buffer_page *bpage) { /* Range pages are not to be freed */ if (!bpage->range) free_pages((unsigned long)bpage->page, bpage->order); kfree(bpage); } /* * We need to fit the time_stamp delta into 27 bits. */ static inline bool test_time_stamp(u64 delta) { return !!(delta & TS_DELTA_TEST); } struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; wait_queue_head_t full_waiters; atomic_t seq; bool waiters_pending; bool full_waiters_pending; bool wakeup_full; }; /* * Structure to hold event state and handle nested events. */ struct rb_event_info { u64 ts; u64 delta; u64 before; u64 after; unsigned long length; struct buffer_page *tail_page; int add_timestamp; }; /* * Used for the add_timestamp * NONE * EXTEND - wants a time extend * ABSOLUTE - the buffer requests all events to have absolute time stamps * FORCE - force a full time stamp. */ enum { RB_ADD_STAMP_NONE = 0, RB_ADD_STAMP_EXTEND = BIT(1), RB_ADD_STAMP_ABSOLUTE = BIT(2), RB_ADD_STAMP_FORCE = BIT(3) }; /* * Used for which event context the event is in. * TRANSITION = 0 * NMI = 1 * IRQ = 2 * SOFTIRQ = 3 * NORMAL = 4 * * See trace_recursive_lock() comment below for more details. */ enum { RB_CTX_TRANSITION, RB_CTX_NMI, RB_CTX_IRQ, RB_CTX_SOFTIRQ, RB_CTX_NORMAL, RB_CTX_MAX }; struct rb_time_struct { local64_t time; }; typedef struct rb_time_struct rb_time_t; #define MAX_NEST 5 /* * head_page == tail_page && head == tail then buffer is empty. */ struct ring_buffer_per_cpu { int cpu; atomic_t record_disabled; atomic_t resize_disabled; struct trace_buffer *buffer; raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; struct buffer_data_page *free_page; unsigned long nr_pages; unsigned int current_context; struct list_head *pages; /* pages generation counter, incremented when the list changes */ unsigned long cnt; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; unsigned long lost_events; unsigned long last_overrun; unsigned long nest; local_t entries_bytes; local_t entries; local_t overrun; local_t commit_overrun; local_t dropped_events; local_t committing; local_t commits; local_t pages_touched; local_t pages_lost; local_t pages_read; long last_pages_touch; size_t shortest_full; unsigned long read; unsigned long read_bytes; rb_time_t write_stamp; rb_time_t before_stamp; u64 event_stamp[MAX_NEST]; u64 read_stamp; /* pages removed since last reset */ unsigned long pages_removed; unsigned int mapped; unsigned int user_mapped; /* user space mapping */ struct mutex mapping_lock; unsigned long *subbuf_ids; /* ID to subbuf VA */ struct trace_buffer_meta *meta_page; struct ring_buffer_meta *ring_meta; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ struct work_struct update_pages_work; struct completion update_done; struct rb_irq_work irq_work; }; struct trace_buffer { unsigned flags; int cpus; atomic_t record_disabled; atomic_t resizing; cpumask_var_t cpumask; struct lock_class_key *reader_lock_key; struct mutex mutex; struct ring_buffer_per_cpu **buffers; struct hlist_node node; u64 (*clock)(void); struct rb_irq_work irq_work; bool time_stamp_abs; unsigned long range_addr_start; unsigned long range_addr_end; long last_text_delta; long last_data_delta; unsigned int subbuf_size; unsigned int subbuf_order; unsigned int max_data_size; }; struct ring_buffer_iter { struct ring_buffer_per_cpu *cpu_buffer; unsigned long head; unsigned long next_event; struct buffer_page *head_page; struct buffer_page *cache_reader_page; unsigned long cache_read; unsigned long cache_pages_removed; u64 read_stamp; u64 page_stamp; struct ring_buffer_event *event; size_t event_size; int missed_events; }; int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s) { struct buffer_data_page field; trace_seq_printf(s, "\tfield: u64 timestamp;\t" "offset:0;\tsize:%u;\tsigned:%u;\n", (unsigned int)sizeof(field.time_stamp), (unsigned int)is_signed_type(u64)); trace_seq_printf(s, "\tfield: local_t commit;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), (unsigned int)sizeof(field.commit), (unsigned int)is_signed_type(long)); trace_seq_printf(s, "\tfield: int overwrite;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), 1, (unsigned int)is_signed_type(long)); trace_seq_printf(s, "\tfield: char data;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), (unsigned int)buffer->subbuf_size, (unsigned int)is_signed_type(char)); return !trace_seq_has_overflowed(s); } static inline void rb_time_read(rb_time_t *t, u64 *ret) { *ret = local64_read(&t->time); } static void rb_time_set(rb_time_t *t, u64 val) { local64_set(&t->time, val); } /* * Enable this to make sure that the event passed to * ring_buffer_event_time_stamp() is not committed and also * is on the buffer that it passed in. */ //#define RB_VERIFY_EVENT #ifdef RB_VERIFY_EVENT static struct list_head *rb_list_head(struct list_head *list); static void verify_event(struct ring_buffer_per_cpu *cpu_buffer, void *event) { struct buffer_page *page = cpu_buffer->commit_page; struct buffer_page *tail_page = READ_ONCE(cpu_buffer->tail_page); struct list_head *next; long commit, write; unsigned long addr = (unsigned long)event; bool done = false; int stop = 0; /* Make sure the event exists and is not committed yet */ do { if (page == tail_page || WARN_ON_ONCE(stop++ > 100)) done = true; commit = local_read(&page->page->commit); write = local_read(&page->write); if (addr >= (unsigned long)&page->page->data[commit] && addr < (unsigned long)&page->page->data[write]) return; next = rb_list_head(page->list.next); page = list_entry(next, struct buffer_page, list); } while (!done); WARN_ON_ONCE(1); } #else static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer, void *event) { } #endif /* * The absolute time stamp drops the 5 MSBs and some clocks may * require them. The rb_fix_abs_ts() will take a previous full * time stamp, and add the 5 MSB of that time stamp on to the * saved absolute time stamp. Then they are compared in case of * the unlikely event that the latest time stamp incremented * the 5 MSB. */ static inline u64 rb_fix_abs_ts(u64 abs, u64 save_ts) { if (save_ts & TS_MSB) { abs |= save_ts & TS_MSB; /* Check for overflow */ if (unlikely(abs < save_ts)) abs += 1ULL << 59; } return abs; } static inline u64 rb_time_stamp(struct trace_buffer *buffer); /** * ring_buffer_event_time_stamp - return the event's current time stamp * @buffer: The buffer that the event is on * @event: the event to get the time stamp of * * Note, this must be called after @event is reserved, and before it is * committed to the ring buffer. And must be called from the same * context where the event was reserved (normal, softirq, irq, etc). * * Returns the time stamp associated with the current event. * If the event has an extended time stamp, then that is used as * the time stamp to return. * In the highly unlikely case that the event was nested more than * the max nesting, then the write_stamp of the buffer is returned, * otherwise current time is returned, but that really neither of * the last two cases should ever happen. */ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[smp_processor_id()]; unsigned int nest; u64 ts; /* If the event includes an absolute time, then just use that */ if (event->type_len == RINGBUF_TYPE_TIME_STAMP) { ts = rb_event_time_stamp(event); return rb_fix_abs_ts(ts, cpu_buffer->tail_page->page->time_stamp); } nest = local_read(&cpu_buffer->committing); verify_event(cpu_buffer, event); if (WARN_ON_ONCE(!nest)) goto fail; /* Read the current saved nesting level time stamp */ if (likely(--nest < MAX_NEST)) return cpu_buffer->event_stamp[nest]; /* Shouldn't happen, warn if it does */ WARN_ONCE(1, "nest (%d) greater than max", nest); fail: rb_time_read(&cpu_buffer->write_stamp, &ts); return ts; } /** * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from * @cpu: The cpu of the ring_buffer to get the number of pages from * * Returns the number of pages that have content in the ring buffer. */ size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu) { size_t read; size_t lost; size_t cnt; read = local_read(&buffer->buffers[cpu]->pages_read); lost = local_read(&buffer->buffers[cpu]->pages_lost); cnt = local_read(&buffer->buffers[cpu]->pages_touched); if (WARN_ON_ONCE(cnt < lost)) return 0; cnt -= lost; /* The reader can read an empty page, but not more than that */ if (cnt < read) { WARN_ON_ONCE(read > cnt + 1); return 0; } return cnt - read; } static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; size_t nr_pages; size_t dirty; nr_pages = cpu_buffer->nr_pages; if (!nr_pages || !full) return true; /* * Add one as dirty will never equal nr_pages, as the sub-buffer * that the writer is on is not counted as dirty. * This is needed if "buffer_percent" is set to 100. */ dirty = ring_buffer_nr_dirty_pages(buffer, cpu) + 1; return (dirty * 100) >= (full * nr_pages); } /* * rb_wake_up_waiters - wake up tasks waiting for ring buffer input * * Schedules a delayed work to wake up any task that is blocked on the * ring buffer waiters queue. */ static void rb_wake_up_waiters(struct irq_work *work) { struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); /* For waiters waiting for the first wake up */ (void)atomic_fetch_inc_release(&rbwork->seq); wake_up_all(&rbwork->waiters); if (rbwork->full_waiters_pending || rbwork->wakeup_full) { /* Only cpu_buffer sets the above flags */ struct ring_buffer_per_cpu *cpu_buffer = container_of(rbwork, struct ring_buffer_per_cpu, irq_work); /* Called from interrupt context */ raw_spin_lock(&cpu_buffer->reader_lock); rbwork->wakeup_full = false; rbwork->full_waiters_pending = false; /* Waking up all waiters, they will reset the shortest full */ cpu_buffer->shortest_full = 0; raw_spin_unlock(&cpu_buffer->reader_lock); wake_up_all(&rbwork->full_waiters); } } /** * ring_buffer_wake_waiters - wake up any waiters on this ring buffer * @buffer: The ring buffer to wake waiters on * @cpu: The CPU buffer to wake waiters on * * In the case of a file that represents a ring buffer is closing, * it is prudent to wake up any waiters that are on this. */ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct rb_irq_work *rbwork; if (!buffer) return; if (cpu == RING_BUFFER_ALL_CPUS) { /* Wake up individual ones too. One level recursion */ for_each_buffer_cpu(buffer, cpu) ring_buffer_wake_waiters(buffer, cpu); rbwork = &buffer->irq_work; } else { if (WARN_ON_ONCE(!buffer->buffers)) return; if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) return; cpu_buffer = buffer->buffers[cpu]; /* The CPU buffer may not have been initialized yet */ if (!cpu_buffer) return; rbwork = &cpu_buffer->irq_work; } /* This can be called in any context */ irq_work_queue(&rbwork->work); } static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer; bool ret = false; /* Reads of all CPUs always waits for any data */ if (cpu == RING_BUFFER_ALL_CPUS) return !ring_buffer_empty(buffer); cpu_buffer = buffer->buffers[cpu]; if (!ring_buffer_empty_cpu(buffer, cpu)) { unsigned long flags; bool pagebusy; if (!full) return true; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; ret = !pagebusy && full_hit(buffer, cpu, full); if (!ret && (!cpu_buffer->shortest_full || cpu_buffer->shortest_full > full)) { cpu_buffer->shortest_full = full; } raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } return ret; } static inline bool rb_wait_cond(struct rb_irq_work *rbwork, struct trace_buffer *buffer, int cpu, int full, ring_buffer_cond_fn cond, void *data) { if (rb_watermark_hit(buffer, cpu, full)) return true; if (cond(data)) return true; /* * The events can happen in critical sections where * checking a work queue can cause deadlocks. * After adding a task to the queue, this flag is set * only to notify events to try to wake up the queue * using irq_work. * * We don't clear it even if the buffer is no longer * empty. The flag only causes the next event to run * irq_work to do the work queue wake up. The worse * that can happen if we race with !trace_empty() is that * an event will cause an irq_work to try to wake up * an empty queue. * * There's no reason to protect this flag either, as * the work queue and irq_work logic will do the necessary * synchronization for the wake ups. The only thing * that is necessary is that the wake up happens after * a task has been queued. It's OK for spurious wake ups. */ if (full) rbwork->full_waiters_pending = true; else rbwork->waiters_pending = true; return false; } struct rb_wait_data { struct rb_irq_work *irq_work; int seq; }; /* * The default wait condition for ring_buffer_wait() is to just to exit the * wait loop the first time it is woken up. */ static bool rb_wait_once(void *data) { struct rb_wait_data *rdata = data; struct rb_irq_work *rbwork = rdata->irq_work; return atomic_read_acquire(&rbwork->seq) != rdata->seq; } /** * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS * @cond: condition function to break out of wait (NULL to run once) * @data: the data to pass to @cond. * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. */ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full, ring_buffer_cond_fn cond, void *data) { struct ring_buffer_per_cpu *cpu_buffer; struct wait_queue_head *waitq; struct rb_irq_work *rbwork; struct rb_wait_data rdata; int ret = 0; /* * Depending on what the caller is waiting for, either any * data in any cpu buffer, or a specific buffer, put the * caller on the appropriate wait queue. */ if (cpu == RING_BUFFER_ALL_CPUS) { rbwork = &buffer->irq_work; /* Full only makes sense on per cpu reads */ full = 0; } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -ENODEV; cpu_buffer = buffer->buffers[cpu]; rbwork = &cpu_buffer->irq_work; } if (full) waitq = &rbwork->full_waiters; else waitq = &rbwork->waiters; /* Set up to exit loop as soon as it is woken */ if (!cond) { cond = rb_wait_once; rdata.irq_work = rbwork; rdata.seq = atomic_read_acquire(&rbwork->seq); data = &rdata; } ret = wait_event_interruptible((*waitq), rb_wait_cond(rbwork, buffer, cpu, full, cond, data)); return ret; } /** * ring_buffer_poll_wait - poll on buffer input * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on * @filp: the file descriptor * @poll_table: The poll descriptor * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. * * Returns EPOLLIN | EPOLLRDNORM if data exists in the buffers, * zero otherwise. */ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table, int full) { struct ring_buffer_per_cpu *cpu_buffer; struct rb_irq_work *rbwork; if (cpu == RING_BUFFER_ALL_CPUS) { rbwork = &buffer->irq_work; full = 0; } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return EPOLLERR; cpu_buffer = buffer->buffers[cpu]; rbwork = &cpu_buffer->irq_work; } if (full) { poll_wait(filp, &rbwork->full_waiters, poll_table); if (rb_watermark_hit(buffer, cpu, full)) return EPOLLIN | EPOLLRDNORM; /* * Only allow full_waiters_pending update to be seen after * the shortest_full is set (in rb_watermark_hit). If the * writer sees the full_waiters_pending flag set, it will * compare the amount in the ring buffer to shortest_full. * If the amount in the ring buffer is greater than the * shortest_full percent, it will call the irq_work handler * to wake up this list. The irq_handler will reset shortest_full * back to zero. That's done under the reader_lock, but * the below smp_mb() makes sure that the update to * full_waiters_pending doesn't leak up into the above. */ smp_mb(); rbwork->full_waiters_pending = true; return 0; } poll_wait(filp, &rbwork->waiters, poll_table); rbwork->waiters_pending = true; /* * There's a tight race between setting the waiters_pending and * checking if the ring buffer is empty. Once the waiters_pending bit * is set, the next event will wake the task up, but we can get stuck * if there's only a single event in. * * FIXME: Ideally, we need a memory barrier on the writer side as well, * but adding a memory barrier to all events will cause too much of a * performance hit in the fast path. We only need a memory barrier when * the buffer goes from empty to having content. But as this race is * extremely small, and it's not a problem if another event comes in, we * will fix it later. */ smp_mb(); if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) return EPOLLIN | EPOLLRDNORM; return 0; } /* buffer may be either ring_buffer or ring_buffer_per_cpu */ #define RB_WARN_ON(b, cond) \ ({ \ int _____ret = unlikely(cond); \ if (_____ret) { \ if (__same_type(*(b), struct ring_buffer_per_cpu)) { \ struct ring_buffer_per_cpu *__b = \ (void *)b; \ atomic_inc(&__b->buffer->record_disabled); \ } else \ atomic_inc(&b->record_disabled); \ WARN_ON(1); \ } \ _____ret; \ }) /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 static inline u64 rb_time_stamp(struct trace_buffer *buffer) { u64 ts; /* Skip retpolines :-( */ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && likely(buffer->clock == trace_clock_local)) ts = trace_clock_local(); else ts = buffer->clock(); /* shift to debug/test normalization and TIME_EXTENTS */ return ts << DEBUG_SHIFT; } u64 ring_buffer_time_stamp(struct trace_buffer *buffer) { u64 time; preempt_disable_notrace(); time = rb_time_stamp(buffer); preempt_enable_notrace(); return time; } EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); void ring_buffer_normalize_time_stamp(struct trace_buffer *buffer, int cpu, u64 *ts) { /* Just stupid testing the normalize function and deltas */ *ts >>= DEBUG_SHIFT; } EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); /* * Making the ring buffer lockless makes things tricky. * Although writes only happen on the CPU that they are on, * and they only need to worry about interrupts. Reads can * happen on any CPU. * * The reader page is always off the ring buffer, but when the * reader finishes with a page, it needs to swap its page with * a new one from the buffer. The reader needs to take from * the head (writes go to the tail). But if a writer is in overwrite * mode and wraps, it must push the head page forward. * * Here lies the problem. * * The reader must be careful to replace only the head page, and * not another one. As described at the top of the file in the * ASCII art, the reader sets its old page to point to the next * page after head. It then sets the page after head to point to * the old reader page. But if the writer moves the head page * during this operation, the reader could end up with the tail. * * We use cmpxchg to help prevent this race. We also do something * special with the page before head. We set the LSB to 1. * * When the writer must push the page forward, it will clear the * bit that points to the head page, move the head, and then set * the bit that points to the new head page. * * We also don't want an interrupt coming in and moving the head * page on another writer. Thus we use the second LSB to catch * that too. Thus: * * head->list->prev->next bit 1 bit 0 * ------- ------- * Normal page 0 0 * Points to head page 0 1 * New head page 1 0 * * Note we can not trust the prev pointer of the head page, because: * * +----+ +-----+ +-----+ * | |------>| T |---X--->| N | * | |<------| | | | * +----+ +-----+ +-----+ * ^ ^ | * | +-----+ | | * +----------| R |----------+ | * | |<-----------+ * +-----+ * * Key: ---X--> HEAD flag set in pointer * T Tail page * R Reader page * N Next page * * (see __rb_reserve_next() to see where this happens) * * What the above shows is that the reader just swapped out * the reader page with a page in the buffer, but before it * could make the new header point back to the new page added * it was preempted by a writer. The writer moved forward onto * the new page added by the reader and is about to move forward * again. * * You can see, it is legitimate for the previous pointer of * the head (or any page) not to point back to itself. But only * temporarily. */ #define RB_PAGE_NORMAL 0UL #define RB_PAGE_HEAD 1UL #define RB_PAGE_UPDATE 2UL #define RB_FLAG_MASK 3UL /* PAGE_MOVED is not part of the mask */ #define RB_PAGE_MOVED 4UL /* * rb_list_head - remove any bit */ static struct list_head *rb_list_head(struct list_head *list) { unsigned long val = (unsigned long)list; return (struct list_head *)(val & ~RB_FLAG_MASK); } /* * rb_is_head_page - test if the given page is the head page * * Because the reader may move the head_page pointer, we can * not trust what the head page is (it may be pointing to * the reader page). But if the next page is a header page, * its flags will be non zero. */ static inline int rb_is_head_page(struct buffer_page *page, struct list_head *list) { unsigned long val; val = (unsigned long)list->next; if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list) return RB_PAGE_MOVED; return val & RB_FLAG_MASK; } /* * rb_is_reader_page * * The unique thing about the reader page, is that, if the * writer is ever on it, the previous pointer never points * back to the reader page. */ static bool rb_is_reader_page(struct buffer_page *page) { struct list_head *list = page->list.prev; return rb_list_head(list->next) != &page->list; } /* * rb_set_list_to_head - set a list_head to be pointing to head. */ static void rb_set_list_to_head(struct list_head *list) { unsigned long *ptr; ptr = (unsigned long *)&list->next; *ptr |= RB_PAGE_HEAD; *ptr &= ~RB_PAGE_UPDATE; } /* * rb_head_page_activate - sets up head page */ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *head; head = cpu_buffer->head_page; if (!head) return; /* * Set the previous list pointer to have the HEAD flag. */ rb_set_list_to_head(head->list.prev); if (cpu_buffer->ring_meta) { struct ring_buffer_meta *meta = cpu_buffer->ring_meta; meta->head_buffer = (unsigned long)head->page; } } static void rb_list_head_clear(struct list_head *list) { unsigned long *ptr = (unsigned long *)&list->next; *ptr &= ~RB_FLAG_MASK; } /* * rb_head_page_deactivate - clears head page ptr (for free list) */ static void rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *hd; /* Go through the whole list and clear any pointers found. */ rb_list_head_clear(cpu_buffer->pages); list_for_each(hd, cpu_buffer->pages) rb_list_head_clear(hd); } static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *head, struct buffer_page *prev, int old_flag, int new_flag) { struct list_head *list; unsigned long val = (unsigned long)&head->list; unsigned long ret; list = &prev->list; val &= ~RB_FLAG_MASK; ret = cmpxchg((unsigned long *)&list->next, val | old_flag, val | new_flag); /* check if the reader took the page */ if ((ret & ~RB_FLAG_MASK) != val) return RB_PAGE_MOVED; return ret & RB_FLAG_MASK; } static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *head, struct buffer_page *prev, int old_flag) { return rb_head_page_set(cpu_buffer, head, prev, old_flag, RB_PAGE_UPDATE); } static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *head, struct buffer_page *prev, int old_flag) { return rb_head_page_set(cpu_buffer, head, prev, old_flag, RB_PAGE_HEAD); } static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *head, struct buffer_page *prev, int old_flag) { return rb_head_page_set(cpu_buffer, head, prev, old_flag, RB_PAGE_NORMAL); } static inline void rb_inc_page(struct buffer_page **bpage) { struct list_head *p = rb_list_head((*bpage)->list.next); *bpage = list_entry(p, struct buffer_page, list); } static struct buffer_page * rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *head; struct buffer_page *page; struct list_head *list; int i; if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page)) return NULL; /* sanity check */ list = cpu_buffer->pages; if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list)) return NULL; page = head = cpu_buffer->head_page; /* * It is possible that the writer moves the header behind * where we started, and we miss in one loop. * A second loop should grab the header, but we'll do * three loops just because I'm paranoid. */ for (i = 0; i < 3; i++) { do { if (rb_is_head_page(page, page->list.prev)) { cpu_buffer->head_page = page; return page; } rb_inc_page(&page); } while (page != head); } RB_WARN_ON(cpu_buffer, 1); return NULL; } static bool rb_head_page_replace(struct buffer_page *old, struct buffer_page *new) { unsigned long *ptr = (unsigned long *)&old->list.prev->next; unsigned long val; val = *ptr & ~RB_FLAG_MASK; val |= RB_PAGE_HEAD; return try_cmpxchg(ptr, &val, (unsigned long)&new->list); } /* * rb_tail_page_update - move the tail page forward */ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page, struct buffer_page *next_page) { unsigned long old_entries; unsigned long old_write; /* * The tail page now needs to be moved forward. * * We need to reset the tail page, but without messing * with possible erasing of data brought in by interrupts * that have moved the tail page and are currently on it. * * We add a counter to the write field to denote this. */ old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); /* * Just make sure we have seen our old_write and synchronize * with any interrupts that come in. */ barrier(); /* * If the tail page is still the same as what we think * it is, then it is up to us to update the tail * pointer. */ if (tail_page == READ_ONCE(cpu_buffer->tail_page)) { /* Zero the write counter */ unsigned long val = old_write & ~RB_WRITE_MASK; unsigned long eval = old_entries & ~RB_WRITE_MASK; /* * This will only succeed if an interrupt did * not come in and change it. In which case, we * do not want to modify it. * * We add (void) to let the compiler know that we do not care * about the return value of these functions. We use the * cmpxchg to only update if an interrupt did not already * do it for us. If the cmpxchg fails, we don't care. */ (void)local_cmpxchg(&next_page->write, old_write, val); (void)local_cmpxchg(&next_page->entries, old_entries, eval); /* * No need to worry about races with clearing out the commit. * it only can increment when a commit takes place. But that * only happens in the outer most nested commit. */ local_set(&next_page->page->commit, 0); /* Either we update tail_page or an interrupt does */ if (try_cmpxchg(&cpu_buffer->tail_page, &tail_page, next_page)) local_inc(&cpu_buffer->pages_touched); } } static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage) { unsigned long val = (unsigned long)bpage; RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK); } static bool rb_check_links(struct ring_buffer_per_cpu *cpu_buffer, struct list_head *list) { if (RB_WARN_ON(cpu_buffer, rb_list_head(rb_list_head(list->next)->prev) != list)) return false; if (RB_WARN_ON(cpu_buffer, rb_list_head(rb_list_head(list->prev)->next) != list)) return false; return true; } /** * rb_check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test * * As a safety measure we check to make sure the data pages have not * been corrupted. */ static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head, *tmp; unsigned long buffer_cnt; unsigned long flags; int nr_loops = 0; /* * Walk the linked list underpinning the ring buffer and validate all * its next and prev links. * * The check acquires the reader_lock to avoid concurrent processing * with code that could be modifying the list. However, the lock cannot * be held for the entire duration of the walk, as this would make the * time when interrupts are disabled non-deterministic, dependent on the * ring buffer size. Therefore, the code releases and re-acquires the * lock after checking each page. The ring_buffer_per_cpu.cnt variable * is then used to detect if the list was modified while the lock was * not held, in which case the check needs to be restarted. * * The code attempts to perform the check at most three times before * giving up. This is acceptable because this is only a self-validation * to detect problems early on. In practice, the list modification * operations are fairly spaced, and so this check typically succeeds at * most on the second try. */ again: if (++nr_loops > 3) return; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); head = rb_list_head(cpu_buffer->pages); if (!rb_check_links(cpu_buffer, head)) goto out_locked; buffer_cnt = cpu_buffer->cnt; tmp = head; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); while (true) { raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (buffer_cnt != cpu_buffer->cnt) { /* The list was updated, try again. */ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); goto again; } tmp = rb_list_head(tmp->next); if (tmp == head) /* The iteration circled back, all is done. */ goto out_locked; if (!rb_check_links(cpu_buffer, tmp)) goto out_locked; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } out_locked: raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /* * Take an address, add the meta data size as well as the array of * array subbuffer indexes, then align it to a subbuffer size. * * This is used to help find the next per cpu subbuffer within a mapped range. */ static unsigned long rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) { addr += sizeof(struct ring_buffer_meta) + sizeof(int) * nr_subbufs; return ALIGN(addr, subbuf_size); } /* * Return the ring_buffer_meta for a given @cpu. */ static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu) { int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE; unsigned long ptr = buffer->range_addr_start; struct ring_buffer_meta *meta; int nr_subbufs; if (!ptr) return NULL; /* When nr_pages passed in is zero, the first meta has already been initialized */ if (!nr_pages) { meta = (struct ring_buffer_meta *)ptr; nr_subbufs = meta->nr_subbufs; } else { meta = NULL; /* Include the reader page */ nr_subbufs = nr_pages + 1; } /* * The first chunk may not be subbuffer aligned, where as * the rest of the chunks are. */ if (cpu) { ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs); ptr += subbuf_size * nr_subbufs; /* We can use multiplication to find chunks greater than 1 */ if (cpu > 1) { unsigned long size; unsigned long p; /* Save the beginning of this CPU chunk */ p = ptr; ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs); ptr += subbuf_size * nr_subbufs; /* Now all chunks after this are the same size */ size = ptr - p; ptr += size * (cpu - 2); } } return (void *)ptr; } /* Return the start of subbufs given the meta pointer */ static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta) { int subbuf_size = meta->subbuf_size; unsigned long ptr; ptr = (unsigned long)meta; ptr = rb_range_align_subbuf(ptr, subbuf_size, meta->nr_subbufs); return (void *)ptr; } /* * Return a specific sub-buffer for a given @cpu defined by @idx. */ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) { struct ring_buffer_meta *meta; unsigned long ptr; int subbuf_size; meta = rb_range_meta(cpu_buffer->buffer, 0, cpu_buffer->cpu); if (!meta) return NULL; if (WARN_ON_ONCE(idx >= meta->nr_subbufs)) return NULL; subbuf_size = meta->subbuf_size; /* Map this buffer to the order that's in meta->buffers[] */ idx = meta->buffers[idx]; ptr = (unsigned long)rb_subbufs_from_meta(meta); ptr += subbuf_size * idx; if (ptr + subbuf_size > cpu_buffer->buffer->range_addr_end) return NULL; return (void *)ptr; } /* * See if the existing memory contains valid ring buffer data. * As the previous kernel must be the same as this kernel, all * the calculations (size of buffers and number of buffers) * must be the same. */ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, struct trace_buffer *buffer, int nr_pages) { int subbuf_size = PAGE_SIZE; struct buffer_data_page *subbuf; unsigned long buffers_start; unsigned long buffers_end; int i; /* Check the meta magic and meta struct size */ if (meta->magic != RING_BUFFER_META_MAGIC || meta->struct_size != sizeof(*meta)) { pr_info("Ring buffer boot meta[%d] mismatch of magic or struct size\n", cpu); return false; } /* The subbuffer's size and number of subbuffers must match */ if (meta->subbuf_size != subbuf_size || meta->nr_subbufs != nr_pages + 1) { pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu); return false; } buffers_start = meta->first_buffer; buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs); /* Is the head and commit buffers within the range of buffers? */ if (meta->head_buffer < buffers_start || meta->head_buffer >= buffers_end) { pr_info("Ring buffer boot meta [%d] head buffer out of range\n", cpu); return false; } if (meta->commit_buffer < buffers_start || meta->commit_buffer >= buffers_end) { pr_info("Ring buffer boot meta [%d] commit buffer out of range\n", cpu); return false; } subbuf = rb_subbufs_from_meta(meta); /* Is the meta buffers and the subbufs themselves have correct data? */ for (i = 0; i < meta->nr_subbufs; i++) { if (meta->buffers[i] < 0 || meta->buffers[i] >= meta->nr_subbufs) { pr_info("Ring buffer boot meta [%d] array out of range\n", cpu); return false; } if ((unsigned)local_read(&subbuf->commit) > subbuf_size) { pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu); return false; } subbuf = (void *)subbuf + subbuf_size; } return true; } static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf); static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu, unsigned long long *timestamp, u64 *delta_ptr) { struct ring_buffer_event *event; u64 ts, delta; int events = 0; int e; *delta_ptr = 0; *timestamp = 0; ts = dpage->time_stamp; for (e = 0; e < tail; e += rb_event_length(event)) { event = (struct ring_buffer_event *)(dpage->data + e); switch (event->type_len) { case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); ts += delta; break; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); delta = rb_fix_abs_ts(delta, ts); if (delta < ts) { *delta_ptr = delta; *timestamp = ts; return -1; } ts = delta; break; case RINGBUF_TYPE_PADDING: if (event->time_delta == 1) break; fallthrough; case RINGBUF_TYPE_DATA: events++; ts += event->time_delta; break; default: return -1; } } *timestamp = ts; return events; } static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) { unsigned long long ts; u64 delta; int tail; tail = local_read(&dpage->commit); return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta); } /* If the meta data has been validated, now validate the events */ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_meta *meta = cpu_buffer->ring_meta; struct buffer_page *head_page; unsigned long entry_bytes = 0; unsigned long entries = 0; int ret; int i; if (!meta || !meta->head_buffer) return; /* Do the reader page first */ ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu); if (ret < 0) { pr_info("Ring buffer reader page is invalid\n"); goto invalid; } entries += ret; entry_bytes += local_read(&cpu_buffer->reader_page->page->commit); local_set(&cpu_buffer->reader_page->entries, ret); head_page = cpu_buffer->head_page; /* If both the head and commit are on the reader_page then we are done. */ if (head_page == cpu_buffer->reader_page && head_page == cpu_buffer->commit_page) goto done; /* Iterate until finding the commit page */ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) { /* Reader page has already been done */ if (head_page == cpu_buffer->reader_page) continue; ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); if (ret < 0) { pr_info("Ring buffer meta [%d] invalid buffer page\n", cpu_buffer->cpu); goto invalid; } entries += ret; entry_bytes += local_read(&head_page->page->commit); local_set(&cpu_buffer->head_page->entries, ret); if (head_page == cpu_buffer->commit_page) break; } if (head_page != cpu_buffer->commit_page) { pr_info("Ring buffer meta [%d] commit page not found\n", cpu_buffer->cpu); goto invalid; } done: local_set(&cpu_buffer->entries, entries); local_set(&cpu_buffer->entries_bytes, entry_bytes); pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu); return; invalid: /* The content of the buffers are invalid, reset the meta data */ meta->head_buffer = 0; meta->commit_buffer = 0; /* Reset the reader page */ local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); /* Reset all the subbuffers */ for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) { local_set(&head_page->entries, 0); local_set(&head_page->page->commit, 0); } } /* Used to calculate data delta */ static char rb_data_ptr[] = ""; #define THIS_TEXT_PTR ((unsigned long)rb_meta_init_text_addr) #define THIS_DATA_PTR ((unsigned long)rb_data_ptr) static void rb_meta_init_text_addr(struct ring_buffer_meta *meta) { meta->text_addr = THIS_TEXT_PTR; meta->data_addr = THIS_DATA_PTR; } static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) { struct ring_buffer_meta *meta; unsigned long delta; void *subbuf; int cpu; int i; for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *next_meta; meta = rb_range_meta(buffer, nr_pages, cpu); if (rb_meta_valid(meta, cpu, buffer, nr_pages)) { /* Make the mappings match the current address */ subbuf = rb_subbufs_from_meta(meta); delta = (unsigned long)subbuf - meta->first_buffer; meta->first_buffer += delta; meta->head_buffer += delta; meta->commit_buffer += delta; buffer->last_text_delta = THIS_TEXT_PTR - meta->text_addr; buffer->last_data_delta = THIS_DATA_PTR - meta->data_addr; continue; } if (cpu < nr_cpu_ids - 1) next_meta = rb_range_meta(buffer, nr_pages, cpu + 1); else next_meta = (void *)buffer->range_addr_end; memset(meta, 0, next_meta - (void *)meta); meta->magic = RING_BUFFER_META_MAGIC; meta->struct_size = sizeof(*meta); meta->nr_subbufs = nr_pages + 1; meta->subbuf_size = PAGE_SIZE; subbuf = rb_subbufs_from_meta(meta); meta->first_buffer = (unsigned long)subbuf; rb_meta_init_text_addr(meta); /* * The buffers[] array holds the order of the sub-buffers * that are after the meta data. The sub-buffers may * be swapped out when read and inserted into a different * location of the ring buffer. Although their addresses * remain the same, the buffers[] array contains the * index into the sub-buffers holding their actual order. */ for (i = 0; i < meta->nr_subbufs; i++) { meta->buffers[i] = i; rb_init_page(subbuf); subbuf += meta->subbuf_size; } } } static void *rbm_start(struct seq_file *m, loff_t *pos) { struct ring_buffer_per_cpu *cpu_buffer = m->private; struct ring_buffer_meta *meta = cpu_buffer->ring_meta; unsigned long val; if (!meta) return NULL; if (*pos > meta->nr_subbufs) return NULL; val = *pos; val++; return (void *)val; } static void *rbm_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; return rbm_start(m, pos); } static int rbm_show(struct seq_file *m, void *v) { struct ring_buffer_per_cpu *cpu_buffer = m->private; struct ring_buffer_meta *meta = cpu_buffer->ring_meta; unsigned long val = (unsigned long)v; if (val == 1) { seq_printf(m, "head_buffer: %d\n", rb_meta_subbuf_idx(meta, (void *)meta->head_buffer)); seq_printf(m, "commit_buffer: %d\n", rb_meta_subbuf_idx(meta, (void *)meta->commit_buffer)); seq_printf(m, "subbuf_size: %d\n", meta->subbuf_size); seq_printf(m, "nr_subbufs: %d\n", meta->nr_subbufs); return 0; } val -= 2; seq_printf(m, "buffer[%ld]: %d\n", val, meta->buffers[val]); return 0; } static void rbm_stop(struct seq_file *m, void *p) { } static const struct seq_operations rb_meta_seq_ops = { .start = rbm_start, .next = rbm_next, .show = rbm_show, .stop = rbm_stop, }; int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu) { struct seq_file *m; int ret; ret = seq_open(file, &rb_meta_seq_ops); if (ret) return ret; m = file->private_data; m->private = buffer->buffers[cpu]; return 0; } /* Map the buffer_pages to the previous head and commit pages */ static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage) { struct ring_buffer_meta *meta = cpu_buffer->ring_meta; if (meta->head_buffer == (unsigned long)bpage->page) cpu_buffer->head_page = bpage; if (meta->commit_buffer == (unsigned long)bpage->page) { cpu_buffer->commit_page = bpage; cpu_buffer->tail_page = bpage; } } static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { struct trace_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; gfp_t mflags; long i; /* * Check if the available memory is there first. * Note, si_mem_available() only gives us a rough estimate of available * memory. It may not be accurate. But we don't care, we just want * to prevent doing any allocation when it is obvious that it is * not going to succeed. */ i = si_mem_available(); if (i < nr_pages) return -ENOMEM; /* * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails * gracefully without invoking oom-killer and the system is not * destabilized. */ mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL; /* * If a user thread allocates too much, and si_mem_available() * reports there's enough memory, even though there is not. * Make sure the OOM killer kills this thread. This can happen * even with RETRY_MAYFAIL because another task may be doing * an allocation after this task has taken all memory. * This is the task the OOM killer needs to take out during this * loop, even if it was triggered by an allocation somewhere else. */ if (user_thread) set_current_oom_origin(); if (buffer->range_addr_start) meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu); for (i = 0; i < nr_pages; i++) { struct page *page; bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), mflags, cpu_to_node(cpu_buffer->cpu)); if (!bpage) goto free_pages; rb_check_bpage(cpu_buffer, bpage); /* * Append the pages as for mapped buffers we want to keep * the order */ list_add_tail(&bpage->list, pages); if (meta) { /* A range was given. Use that for the buffer page */ bpage->page = rb_range_buffer(cpu_buffer, i + 1); if (!bpage->page) goto free_pages; /* If this is valid from a previous boot */ if (meta->head_buffer) rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; bpage->id = i + 1; } else { page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) goto free_pages; bpage->page = page_address(page); rb_init_page(bpage->page); } bpage->order = cpu_buffer->buffer->subbuf_order; if (user_thread && fatal_signal_pending(current)) goto free_pages; } if (user_thread) clear_current_oom_origin(); return 0; free_pages: list_for_each_entry_safe(bpage, tmp, pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } if (user_thread) clear_current_oom_origin(); return -ENOMEM; } static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) { LIST_HEAD(pages); WARN_ON(!nr_pages); if (__rb_allocate_pages(cpu_buffer, nr_pages, &pages)) return -ENOMEM; /* * The ring buffer page list is a circular list that does not * start and end with a list head. All page list items point to * other pages. */ cpu_buffer->pages = pages.next; list_del(&pages); cpu_buffer->nr_pages = nr_pages; rb_check_pages(cpu_buffer); return 0; } static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_meta *meta; struct buffer_page *bpage; struct page *page; int ret; cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); if (!cpu_buffer) return NULL; cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; raw_spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); init_completion(&cpu_buffer->update_done); init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&cpu_buffer->irq_work.waiters); init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); mutex_init(&cpu_buffer->mapping_lock); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); if (!bpage) goto fail_free_buffer; rb_check_bpage(cpu_buffer, bpage); cpu_buffer->reader_page = bpage; if (buffer->range_addr_start) { /* * Range mapped buffers have the same restrictions as memory * mapped ones do. */ cpu_buffer->mapped = 1; cpu_buffer->ring_meta = rb_range_meta(buffer, nr_pages, cpu); bpage->page = rb_range_buffer(cpu_buffer, 0); if (!bpage->page) goto fail_free_reader; if (cpu_buffer->ring_meta->head_buffer) rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; } else { page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) goto fail_free_reader; bpage->page = page_address(page); rb_init_page(bpage->page); } INIT_LIST_HEAD(&cpu_buffer->reader_page->list); INIT_LIST_HEAD(&cpu_buffer->new_pages); ret = rb_allocate_pages(cpu_buffer, nr_pages); if (ret < 0) goto fail_free_reader; rb_meta_validate_events(cpu_buffer); /* If the boot meta was valid then this has already been updated */ meta = cpu_buffer->ring_meta; if (!meta || !meta->head_buffer || !cpu_buffer->head_page || !cpu_buffer->commit_page || !cpu_buffer->tail_page) { if (meta && meta->head_buffer && (cpu_buffer->head_page || cpu_buffer->commit_page || cpu_buffer->tail_page)) { pr_warn("Ring buffer meta buffers not all mapped\n"); if (!cpu_buffer->head_page) pr_warn(" Missing head_page\n"); if (!cpu_buffer->commit_page) pr_warn(" Missing commit_page\n"); if (!cpu_buffer->tail_page) pr_warn(" Missing tail_page\n"); } cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; rb_head_page_activate(cpu_buffer); if (cpu_buffer->ring_meta) meta->commit_buffer = meta->head_buffer; } else { /* The valid meta buffer still needs to activate the head page */ rb_head_page_activate(cpu_buffer); } return cpu_buffer; fail_free_reader: free_buffer_page(cpu_buffer->reader_page); fail_free_buffer: kfree(cpu_buffer); return NULL; } static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; irq_work_sync(&cpu_buffer->irq_work.work); free_buffer_page(cpu_buffer->reader_page); if (head) { rb_head_page_deactivate(cpu_buffer); list_for_each_entry_safe(bpage, tmp, head, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } bpage = list_entry(head, struct buffer_page, list); free_buffer_page(bpage); } free_page((unsigned long)cpu_buffer->free_page); kfree(cpu_buffer); } static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, struct lock_class_key *key) { struct trace_buffer *buffer; long nr_pages; int subbuf_size; int bsize; int cpu; int ret; /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), GFP_KERNEL); if (!buffer) return NULL; if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; buffer->subbuf_order = order; subbuf_size = (PAGE_SIZE << order); buffer->subbuf_size = subbuf_size - BUF_PAGE_HDR_SIZE; /* Max payload is buffer page size - header (8bytes) */ buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2); buffer->flags = flags; buffer->clock = trace_clock_local; buffer->reader_lock_key = key; init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&buffer->irq_work.waiters); buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), GFP_KERNEL); if (!buffer->buffers) goto fail_free_cpumask; /* If start/end are specified, then that overrides size */ if (start && end) { unsigned long ptr; int n; size = end - start; size = size / nr_cpu_ids; /* * The number of sub-buffers (nr_pages) is determined by the * total size allocated minus the meta data size. * Then that is divided by the number of per CPU buffers * needed, plus account for the integer array index that * will be appended to the meta data. */ nr_pages = (size - sizeof(struct ring_buffer_meta)) / (subbuf_size + sizeof(int)); /* Need at least two pages plus the reader page */ if (nr_pages < 3) goto fail_free_buffers; again: /* Make sure that the size fits aligned */ for (n = 0, ptr = start; n < nr_cpu_ids; n++) { ptr += sizeof(struct ring_buffer_meta) + sizeof(int) * nr_pages; ptr = ALIGN(ptr, subbuf_size); ptr += subbuf_size * nr_pages; } if (ptr > end) { if (nr_pages <= 3) goto fail_free_buffers; nr_pages--; goto again; } /* nr_pages should not count the reader page */ nr_pages--; buffer->range_addr_start = start; buffer->range_addr_end = end; rb_range_meta_init(buffer, nr_pages); } else { /* need at least two pages */ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); if (nr_pages < 2) nr_pages = 2; } cpu = raw_smp_processor_id(); cpumask_set_cpu(cpu, buffer->cpumask); buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) goto fail_free_buffers; ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); if (ret < 0) goto fail_free_buffers; mutex_init(&buffer->mutex); return buffer; fail_free_buffers: for_each_buffer_cpu(buffer, cpu) { if (buffer->buffers[cpu]) rb_free_cpu_buffer(buffer->buffers[cpu]); } kfree(buffer->buffers); fail_free_cpumask: free_cpumask_var(buffer->cpumask); fail_free_buffer: kfree(buffer); return NULL; } /** * __ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes per cpu that is needed. * @flags: attributes to set for the ring buffer. * @key: ring buffer reader_lock_key. * * Currently the only flag that is available is the RB_FL_OVERWRITE * flag. This flag means that the buffer will overwrite old data * when the buffer wraps. If this flag is not set, the buffer will * drop data when the tail hits the head. */ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { /* Default buffer page size - one system page */ return alloc_buffer(size, flags, 0, 0, 0,key); } EXPORT_SYMBOL_GPL(__ring_buffer_alloc); /** * __ring_buffer_alloc_range - allocate a new ring_buffer from existing memory * @size: the size in bytes per cpu that is needed. * @flags: attributes to set for the ring buffer. * @order: sub-buffer order * @start: start of allocated range * @range_size: size of allocated range * @key: ring buffer reader_lock_key. * * Currently the only flag that is available is the RB_FL_OVERWRITE * flag. This flag means that the buffer will overwrite old data * when the buffer wraps. If this flag is not set, the buffer will * drop data when the tail hits the head. */ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long range_size, struct lock_class_key *key) { return alloc_buffer(size, flags, order, start, start + range_size, key); } /** * ring_buffer_last_boot_delta - return the delta offset from last boot * @buffer: The buffer to return the delta from * @text: Return text delta * @data: Return data delta * * Returns: The true if the delta is non zero */ bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text, long *data) { if (!buffer) return false; if (!buffer->last_text_delta) return false; *text = buffer->last_text_delta; *data = buffer->last_data_delta; return true; } /** * ring_buffer_free - free a ring buffer. * @buffer: the buffer to free. */ void ring_buffer_free(struct trace_buffer *buffer) { int cpu; cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); irq_work_sync(&buffer->irq_work.work); for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); kfree(buffer); } EXPORT_SYMBOL_GPL(ring_buffer_free); void ring_buffer_set_clock(struct trace_buffer *buffer, u64 (*clock)(void)) { buffer->clock = clock; } void ring_buffer_set_time_stamp_abs(struct trace_buffer *buffer, bool abs) { buffer->time_stamp_abs = abs; } bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) { return buffer->time_stamp_abs; } static inline unsigned long rb_page_entries(struct buffer_page *bpage) { return local_read(&bpage->entries) & RB_WRITE_MASK; } static inline unsigned long rb_page_write(struct buffer_page *bpage) { return local_read(&bpage->write) & RB_WRITE_MASK; } static bool rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) { struct list_head *tail_page, *to_remove, *next_page; struct buffer_page *to_remove_page, *tmp_iter_page; struct buffer_page *last_page, *first_page; unsigned long nr_removed; unsigned long head_bit; int page_entries; head_bit = 0; raw_spin_lock_irq(&cpu_buffer->reader_lock); atomic_inc(&cpu_buffer->record_disabled); /* * We don't race with the readers since we have acquired the reader * lock. We also don't race with writers after disabling recording. * This makes it easy to figure out the first and the last page to be * removed from the list. We unlink all the pages in between including * the first and last pages. This is done in a busy loop so that we * lose the least number of traces. * The pages are freed after we restart recording and unlock readers. */ tail_page = &cpu_buffer->tail_page->list; /* * tail page might be on reader page, we remove the next page * from the ring buffer */ if (cpu_buffer->tail_page == cpu_buffer->reader_page) tail_page = rb_list_head(tail_page->next); to_remove = tail_page; /* start of pages to remove */ first_page = list_entry(rb_list_head(to_remove->next), struct buffer_page, list); for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { to_remove = rb_list_head(to_remove)->next; head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; } /* Read iterators need to reset themselves when some pages removed */ cpu_buffer->pages_removed += nr_removed; next_page = rb_list_head(to_remove)->next; /* * Now we remove all pages between tail_page and next_page. * Make sure that we have head_bit value preserved for the * next page */ tail_page->next = (struct list_head *)((unsigned long)next_page | head_bit); next_page = rb_list_head(next_page); next_page->prev = tail_page; /* make sure pages points to a valid page in the ring buffer */ cpu_buffer->pages = next_page; cpu_buffer->cnt++; /* update head page */ if (head_bit) cpu_buffer->head_page = list_entry(next_page, struct buffer_page, list); /* pages are removed, resume tracing and then free the pages */ atomic_dec(&cpu_buffer->record_disabled); raw_spin_unlock_irq(&cpu_buffer->reader_lock); RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)); /* last buffer page to remove */ last_page = list_entry(rb_list_head(to_remove), struct buffer_page, list); tmp_iter_page = first_page; do { cond_resched(); to_remove_page = tmp_iter_page; rb_inc_page(&tmp_iter_page); /* update the counters */ page_entries = rb_page_entries(to_remove_page); if (page_entries) { /* * If something was added to this page, it was full * since it is not the tail page. So we deduct the * bytes consumed in ring buffer from here. * Increment overrun to account for the lost events. */ local_add(page_entries, &cpu_buffer->overrun); local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); } /* * We have already removed references to this list item, just * free up the buffer_page and its page */ free_buffer_page(to_remove_page); nr_removed--; } while (to_remove_page != last_page); RB_WARN_ON(cpu_buffer, nr_removed); return nr_removed == 0; } static bool rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *pages = &cpu_buffer->new_pages; unsigned long flags; bool success; int retries; /* Can be called at early boot up, where interrupts must not been enabled */ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* * We are holding the reader lock, so the reader page won't be swapped * in the ring buffer. Now we are racing with the writer trying to * move head page and the tail page. * We are going to adapt the reader page update process where: * 1. We first splice the start and end of list of new pages between * the head page and its previous page. * 2. We cmpxchg the prev_page->next to point from head page to the * start of new pages list. * 3. Finally, we update the head->prev to the end of new list. * * We will try this process 10 times, to make sure that we don't keep * spinning. */ retries = 10; success = false; while (retries--) { struct list_head *head_page, *prev_page; struct list_head *last_page, *first_page; struct list_head *head_page_with_bit; struct buffer_page *hpage = rb_set_head_page(cpu_buffer); if (!hpage) break; head_page = &hpage->list; prev_page = head_page->prev; first_page = pages->next; last_page = pages->prev; head_page_with_bit = (struct list_head *) ((unsigned long)head_page | RB_PAGE_HEAD); last_page->next = head_page_with_bit; first_page->prev = prev_page; /* caution: head_page_with_bit gets updated on cmpxchg failure */ if (try_cmpxchg(&prev_page->next, &head_page_with_bit, first_page)) { /* * yay, we replaced the page pointer to our new list, * now, we just have to update to head page's prev * pointer to point to end of list */ head_page->prev = last_page; cpu_buffer->cnt++; success = true; break; } } if (success) INIT_LIST_HEAD(pages); /* * If we weren't successful in adding in new pages, warn and stop * tracing */ RB_WARN_ON(cpu_buffer, !success); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); /* free pages if they weren't inserted */ if (!success) { struct buffer_page *bpage, *tmp; list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } } return success; } static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) { bool success; if (cpu_buffer->nr_pages_to_update > 0) success = rb_insert_pages(cpu_buffer); else success = rb_remove_pages(cpu_buffer, -cpu_buffer->nr_pages_to_update); if (success) cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; } static void update_pages_handler(struct work_struct *work) { struct ring_buffer_per_cpu *cpu_buffer = container_of(work, struct ring_buffer_per_cpu, update_pages_work); rb_update_pages(cpu_buffer); complete(&cpu_buffer->update_done); } /** * ring_buffer_resize - resize the ring buffer * @buffer: the buffer to resize. * @size: the new size. * @cpu_id: the cpu buffer to resize * * Minimum size is 2 * buffer->subbuf_size. * * Returns 0 on success and < 0 on failure. */ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, int cpu_id) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long nr_pages; int cpu, err; /* * Always succeed at resizing a non-existent buffer: */ if (!buffer) return 0; /* Make sure the requested buffer exists */ if (cpu_id != RING_BUFFER_ALL_CPUS && !cpumask_test_cpu(cpu_id, buffer->cpumask)) return 0; nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); /* we need a minimum of two pages */ if (nr_pages < 2) nr_pages = 2; /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&buffer->resizing); if (cpu_id == RING_BUFFER_ALL_CPUS) { /* * Don't succeed if resizing is disabled, as a reader might be * manipulating the ring buffer and is expecting a sane state while * this is true. */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (atomic_read(&cpu_buffer->resize_disabled)) { err = -EBUSY; goto out_err_unlock; } } /* calculate the pages to update */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; cpu_buffer->nr_pages_to_update = nr_pages - cpu_buffer->nr_pages; /* * nothing more to do for removing pages or no update */ if (cpu_buffer->nr_pages_to_update <= 0) continue; /* * to add pages, make sure all new pages can be * allocated without receiving ENOMEM */ INIT_LIST_HEAD(&cpu_buffer->new_pages); if (__rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update, &cpu_buffer->new_pages)) { /* not enough memory for new pages */ err = -ENOMEM; goto out_err; } cond_resched(); } cpus_read_lock(); /* * Fire off all the required work handlers * We can't schedule on offline CPUs, but it's not necessary * since we can change their buffer sizes without any race. */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (!cpu_buffer->nr_pages_to_update) continue; /* Can't run something on an offline CPU. */ if (!cpu_online(cpu)) { rb_update_pages(cpu_buffer); cpu_buffer->nr_pages_to_update = 0; } else { /* Run directly if possible. */ migrate_disable(); if (cpu != smp_processor_id()) { migrate_enable(); schedule_work_on(cpu, &cpu_buffer->update_pages_work); } else { update_pages_handler(&cpu_buffer->update_pages_work); migrate_enable(); } } } /* wait for all the updates to complete */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (!cpu_buffer->nr_pages_to_update) continue; if (cpu_online(cpu)) wait_for_completion(&cpu_buffer->update_done); cpu_buffer->nr_pages_to_update = 0; } cpus_read_unlock(); } else { cpu_buffer = buffer->buffers[cpu_id]; if (nr_pages == cpu_buffer->nr_pages) goto out; /* * Don't succeed if resizing is disabled, as a reader might be * manipulating the ring buffer and is expecting a sane state while * this is true. */ if (atomic_read(&cpu_buffer->resize_disabled)) { err = -EBUSY; goto out_err_unlock; } cpu_buffer->nr_pages_to_update = nr_pages - cpu_buffer->nr_pages; INIT_LIST_HEAD(&cpu_buffer->new_pages); if (cpu_buffer->nr_pages_to_update > 0 && __rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update, &cpu_buffer->new_pages)) { err = -ENOMEM; goto out_err; } cpus_read_lock(); /* Can't run something on an offline CPU. */ if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); else { /* Run directly if possible. */ migrate_disable(); if (cpu_id == smp_processor_id()) { rb_update_pages(cpu_buffer); migrate_enable(); } else { migrate_enable(); schedule_work_on(cpu_id, &cpu_buffer->update_pages_work); wait_for_completion(&cpu_buffer->update_done); } } cpu_buffer->nr_pages_to_update = 0; cpus_read_unlock(); } out: /* * The ring buffer resize can happen with the ring buffer * enabled, so that the update disturbs the tracing as little * as possible. But if the buffer is disabled, we do not need * to worry about that, and we can take the time to verify * that the buffer is not corrupt. */ if (atomic_read(&buffer->record_disabled)) { atomic_inc(&buffer->record_disabled); /* * Even though the buffer was disabled, we must make sure * that it is truly disabled before calling rb_check_pages. * There could have been a race between checking * record_disable and incrementing it. */ synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; rb_check_pages(cpu_buffer); } atomic_dec(&buffer->record_disabled); } atomic_dec(&buffer->resizing); mutex_unlock(&buffer->mutex); return 0; out_err: for_each_buffer_cpu(buffer, cpu) { struct buffer_page *bpage, *tmp; cpu_buffer = buffer->buffers[cpu]; cpu_buffer->nr_pages_to_update = 0; if (list_empty(&cpu_buffer->new_pages)) continue; list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } } out_err_unlock: atomic_dec(&buffer->resizing); mutex_unlock(&buffer->mutex); return err; } EXPORT_SYMBOL_GPL(ring_buffer_resize); void ring_buffer_change_overwrite(struct trace_buffer *buffer, int val) { mutex_lock(&buffer->mutex); if (val) buffer->flags |= RB_FL_OVERWRITE; else buffer->flags &= ~RB_FL_OVERWRITE; mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) { return bpage->page->data + index; } static __always_inline struct ring_buffer_event * rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) { return __rb_page_index(cpu_buffer->reader_page, cpu_buffer->reader_page->read); } static struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { struct ring_buffer_event *event; struct buffer_page *iter_head_page = iter->head_page; unsigned long commit; unsigned length; if (iter->head != iter->next_event) return iter->event; /* * When the writer goes across pages, it issues a cmpxchg which * is a mb(), which will synchronize with the rmb here. * (see rb_tail_page_update() and __rb_reserve_next()) */ commit = rb_page_commit(iter_head_page); smp_rmb(); /* An event needs to be at least 8 bytes in size */ if (iter->head > commit - 8) goto reset; event = __rb_page_index(iter_head_page, iter->head); length = rb_event_length(event); /* * READ_ONCE() doesn't work on functions and we don't want the * compiler doing any crazy optimizations with length. */ barrier(); if ((iter->head + length) > commit || length > iter->event_size) /* Writer corrupted the read? */ goto reset; memcpy(iter->event, event, length); /* * If the page stamp is still the same after this rmb() then the * event was safely copied without the writer entering the page. */ smp_rmb(); /* Make sure the page didn't change since we read this */ if (iter->page_stamp != iter_head_page->page->time_stamp || commit > rb_page_commit(iter_head_page)) goto reset; iter->next_event = iter->head + length; return iter->event; reset: /* Reset to the beginning */ iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp; iter->head = 0; iter->next_event = 0; iter->missed_events = 1; return NULL; } /* Size is determined by what has been committed */ static __always_inline unsigned rb_page_size(struct buffer_page *bpage) { return rb_page_commit(bpage) & ~RB_MISSED_MASK; } static __always_inline unsigned rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) { return rb_page_commit(cpu_buffer->commit_page); } static __always_inline unsigned rb_event_index(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; addr &= (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1; return addr - BUF_PAGE_HDR_SIZE; } static void rb_inc_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; /* * The iterator could be on the reader page (it starts there). * But the head could have moved, since the reader was * found. Check for this case and assign the iterator * to the head page instead of next. */ if (iter->head_page == cpu_buffer->reader_page) iter->head_page = rb_set_head_page(cpu_buffer); else rb_inc_page(&iter->head_page); iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp; iter->head = 0; iter->next_event = 0; } /* Return the index into the sub-buffers for a given sub-buffer */ static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf) { void *subbuf_array; subbuf_array = (void *)meta + sizeof(int) * meta->nr_subbufs; subbuf_array = (void *)ALIGN((unsigned long)subbuf_array, meta->subbuf_size); return (subbuf - subbuf_array) / meta->subbuf_size; } static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *next_page) { struct ring_buffer_meta *meta = cpu_buffer->ring_meta; unsigned long old_head = (unsigned long)next_page->page; unsigned long new_head; rb_inc_page(&next_page); new_head = (unsigned long)next_page->page; /* * Only move it forward once, if something else came in and * moved it forward, then we don't want to touch it. */ (void)cmpxchg(&meta->head_buffer, old_head, new_head); } static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *reader) { struct ring_buffer_meta *meta = cpu_buffer->ring_meta; void *old_reader = cpu_buffer->reader_page->page; void *new_reader = reader->page; int id; id = reader->id; cpu_buffer->reader_page->id = id; reader->id = 0; meta->buffers[0] = rb_meta_subbuf_idx(meta, new_reader); meta->buffers[id] = rb_meta_subbuf_idx(meta, old_reader); /* The head pointer is the one after the reader */ rb_update_meta_head(cpu_buffer, reader); } /* * rb_handle_head_page - writer hit the head page * * Returns: +1 to retry page * 0 to continue * -1 on error */ static int rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page, struct buffer_page *next_page) { struct buffer_page *new_head; int entries; int type; int ret; entries = rb_page_entries(next_page); /* * The hard part is here. We need to move the head * forward, and protect against both readers on * other CPUs and writers coming in via interrupts. */ type = rb_head_page_set_update(cpu_buffer, next_page, tail_page, RB_PAGE_HEAD); /* * type can be one of four: * NORMAL - an interrupt already moved it for us * HEAD - we are the first to get here. * UPDATE - we are the interrupt interrupting * a current move. * MOVED - a reader on another CPU moved the next * pointer to its reader page. Give up * and try again. */ switch (type) { case RB_PAGE_HEAD: /* * We changed the head to UPDATE, thus * it is our responsibility to update * the counters. */ local_add(entries, &cpu_buffer->overrun); local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); if (cpu_buffer->ring_meta) rb_update_meta_head(cpu_buffer, next_page); /* * The entries will be zeroed out when we move the * tail page. */ /* still more to do */ break; case RB_PAGE_UPDATE: /* * This is an interrupt that interrupt the * previous update. Still more to do. */ break; case RB_PAGE_NORMAL: /* * An interrupt came in before the update * and processed this for us. * Nothing left to do. */ return 1; case RB_PAGE_MOVED: /* * The reader is on another CPU and just did * a swap with our next_page. * Try again. */ return 1; default: RB_WARN_ON(cpu_buffer, 1); /* WTF??? */ return -1; } /* * Now that we are here, the old head pointer is * set to UPDATE. This will keep the reader from * swapping the head page with the reader page. * The reader (on another CPU) will spin till * we are finished. * * We just need to protect against interrupts * doing the job. We will set the next pointer * to HEAD. After that, we set the old pointer * to NORMAL, but only if it was HEAD before. * otherwise we are an interrupt, and only * want the outer most commit to reset it. */ new_head = next_page; rb_inc_page(&new_head); ret = rb_head_page_set_head(cpu_buffer, new_head, next_page, RB_PAGE_NORMAL); /* * Valid returns are: * HEAD - an interrupt came in and already set it. * NORMAL - One of two things: * 1) We really set it. * 2) A bunch of interrupts came in and moved * the page forward again. */ switch (ret) { case RB_PAGE_HEAD: case RB_PAGE_NORMAL: /* OK */ break; default: RB_WARN_ON(cpu_buffer, 1); return -1; } /* * It is possible that an interrupt came in, * set the head up, then more interrupts came in * and moved it again. When we get back here, * the page would have been set to NORMAL but we * just set it back to HEAD. * * How do you detect this? Well, if that happened * the tail page would have moved. */ if (ret == RB_PAGE_NORMAL) { struct buffer_page *buffer_tail_page; buffer_tail_page = READ_ONCE(cpu_buffer->tail_page); /* * If the tail had moved passed next, then we need * to reset the pointer. */ if (buffer_tail_page != tail_page && buffer_tail_page != next_page) rb_head_page_set_normal(cpu_buffer, new_head, next_page, RB_PAGE_HEAD); } /* * If this was the outer most commit (the one that * changed the original pointer from HEAD to UPDATE), * then it is up to us to reset it to NORMAL. */ if (type == RB_PAGE_HEAD) { ret = rb_head_page_set_normal(cpu_buffer, next_page, tail_page, RB_PAGE_UPDATE); if (RB_WARN_ON(cpu_buffer, ret != RB_PAGE_UPDATE)) return -1; } return 0; } static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long tail, struct rb_event_info *info) { unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; unsigned long length = info->length; /* * Only the event that crossed the page boundary * must fill the old tail_page with padding. */ if (tail >= bsize) { /* * If the page was filled, then we still need * to update the real_end. Reset it to zero * and the reader will ignore it. */ if (tail == bsize) tail_page->real_end = 0; local_sub(length, &tail_page->write); return; } event = __rb_page_index(tail_page, tail); /* * Save the original length to the meta data. * This will be used by the reader to add lost event * counter. */ tail_page->real_end = tail; /* * If this event is bigger than the minimum size, then * we need to be careful that we don't subtract the * write counter enough to allow another writer to slip * in on this page. * We put in a discarded commit instead, to make sure * that this space is not used again, and this space will * not be accounted into 'entries_bytes'. * * If we are less than the minimum size, we don't need to * worry about it. */ if (tail > (bsize - RB_EVNT_MIN_SIZE)) { /* No room for any events */ /* Mark the rest of the page with padding */ rb_event_set_padding(event); /* Make sure the padding is visible before the write update */ smp_wmb(); /* Set the write back to the previous setting */ local_sub(length, &tail_page->write); return; } /* Put in a discarded event */ event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ event->time_delta = 1; /* account for padding bytes */ local_add(bsize - tail, &cpu_buffer->entries_bytes); /* Make sure the padding is visible before the tail_page->write update */ smp_wmb(); /* Set write to end of buffer */ length = (tail + length) - bsize; local_sub(length, &tail_page->write); } static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer); /* * This is the slow path, force gcc not to inline it. */ static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long tail, struct rb_event_info *info) { struct buffer_page *tail_page = info->tail_page; struct buffer_page *commit_page = cpu_buffer->commit_page; struct trace_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; next_page = tail_page; rb_inc_page(&next_page); /* * If for some reason, we had an interrupt storm that made * it all the way around the buffer, bail, and warn * about it. */ if (unlikely(next_page == commit_page)) { local_inc(&cpu_buffer->commit_overrun); goto out_reset; } /* * This is where the fun begins! * * We are fighting against races between a reader that * could be on another CPU trying to swap its reader * page with the buffer head. * * We are also fighting against interrupts coming in and * moving the head or tail on us as well. * * If the next page is the head page then we have filled * the buffer, unless the commit page is still on the * reader page. */ if (rb_is_head_page(next_page, &tail_page->list)) { /* * If the commit is not on the reader page, then * move the header page. */ if (!rb_is_reader_page(cpu_buffer->commit_page)) { /* * If we are not in overwrite mode, * this is easy, just stop here. */ if (!(buffer->flags & RB_FL_OVERWRITE)) { local_inc(&cpu_buffer->dropped_events); goto out_reset; } ret = rb_handle_head_page(cpu_buffer, tail_page, next_page); if (ret < 0) goto out_reset; if (ret) goto out_again; } else { /* * We need to be careful here too. The * commit page could still be on the reader * page. We could have a small buffer, and * have filled up the buffer with events * from interrupts and such, and wrapped. * * Note, if the tail page is also on the * reader_page, we let it move out. */ if (unlikely((cpu_buffer->commit_page != cpu_buffer->tail_page) && (cpu_buffer->commit_page == cpu_buffer->reader_page))) { local_inc(&cpu_buffer->commit_overrun); goto out_reset; } } } rb_tail_page_update(cpu_buffer, tail_page, next_page); out_again: rb_reset_tail(cpu_buffer, tail, info); /* Commit what we have for now. */ rb_end_commit(cpu_buffer); /* rb_end_commit() decs committing */ local_inc(&cpu_buffer->committing); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); out_reset: /* reset write */ rb_reset_tail(cpu_buffer, tail, info); return NULL; } /* Slow path */ static struct ring_buffer_event * rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event, u64 delta, bool abs) { if (abs) event->type_len = RINGBUF_TYPE_TIME_STAMP; else event->type_len = RINGBUF_TYPE_TIME_EXTEND; /* Not the first event on the page, or not delta? */ if (abs || rb_event_index(cpu_buffer, event)) { event->time_delta = delta & TS_MASK; event->array[0] = delta >> TS_SHIFT; } else { /* nope, just zero it */ event->time_delta = 0; event->array[0] = 0; } return skip_time_extend(event); } #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK static inline bool sched_clock_stable(void) { return true; } #endif static void rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) { u64 write_stamp; WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s", (unsigned long long)info->delta, (unsigned long long)info->ts, (unsigned long long)info->before, (unsigned long long)info->after, (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_stamp); write_stamp;}), sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" " echo global > /sys/kernel/tracing/trace_clock\n" "or add trace_clock=global to the kernel command line\n"); } static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event **event, struct rb_event_info *info, u64 *delta, unsigned int *length) { bool abs = info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); if (unlikely(info->delta > (1ULL << 59))) { /* * Some timers can use more than 59 bits, and when a timestamp * is added to the buffer, it will lose those bits. */ if (abs && (info->ts & TS_MSB)) { info->delta &= ABS_TS_MASK; /* did the clock go backwards */ } else if (info->before == info->after && info->before > info->ts) { /* not interrupted */ static int once; /* * This is possible with a recalibrating of the TSC. * Do not produce a call stack, but just report it. */ if (!once) { once++; pr_warn("Ring buffer clock went backwards: %llu -> %llu\n", info->before, info->ts); } } else rb_check_timestamp(cpu_buffer, info); if (!abs) info->delta = 0; } *event = rb_add_time_stamp(cpu_buffer, *event, info->delta, abs); *length -= RB_LEN_TIME_EXTEND; *delta = 0; } /** * rb_update_event - update event type and data * @cpu_buffer: The per cpu buffer of the @event * @event: the event to update * @info: The info to update the @event with (contains length and delta) * * Update the type and data fields of the @event. The length * is the actual size that is written to the ring buffer, * and with this, we can determine what to place into the * data field. */ static void rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event, struct rb_event_info *info) { unsigned length = info->length; u64 delta = info->delta; unsigned int nest = local_read(&cpu_buffer->committing) - 1; if (!WARN_ON_ONCE(nest >= MAX_NEST)) cpu_buffer->event_stamp[nest] = info->ts; /* * If we need to add a timestamp, then we * add it to the start of the reserved space. */ if (unlikely(info->add_timestamp)) rb_add_timestamp(cpu_buffer, &event, info, &delta, &length); event->time_delta = delta; length -= RB_EVNT_HDR_SIZE; if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { event->type_len = 0; event->array[0] = length; } else event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); } static unsigned rb_calculate_event_length(unsigned length) { struct ring_buffer_event event; /* Used only for sizeof array */ /* zero length can cause confusions */ if (!length) length++; if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) length += sizeof(event.array[0]); length += RB_EVNT_HDR_SIZE; length = ALIGN(length, RB_ARCH_ALIGNMENT); /* * In case the time delta is larger than the 27 bits for it * in the header, we need to add a timestamp. If another * event comes in when trying to discard this one to increase * the length, then the timestamp will be added in the allocated * space of this event. If length is bigger than the size needed * for the TIME_EXTEND, then padding has to be used. The events * length must be either RB_LEN_TIME_EXTEND, or greater than or equal * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. * As length is a multiple of 4, we only need to worry if it * is 12 (RB_LEN_TIME_EXTEND + 4). */ if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) length += RB_ALIGNMENT; return length; } static inline bool rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { unsigned long new_index, old_index; struct buffer_page *bpage; unsigned long addr; new_index = rb_event_index(cpu_buffer, event); old_index = new_index + rb_event_ts_length(event); addr = (unsigned long)event; addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1); bpage = READ_ONCE(cpu_buffer->tail_page); /* * Make sure the tail_page is still the same and * the next write location is the end of this event */ if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; unsigned long event_length = rb_event_length(event); /* * For the before_stamp to be different than the write_stamp * to make sure that the next event adds an absolute * value and does not rely on the saved write stamp, which * is now going to be bogus. * * By setting the before_stamp to zero, the next event * is not going to use the write_stamp and will instead * create an absolute timestamp. This means there's no * reason to update the wirte_stamp! */ rb_time_set(&cpu_buffer->before_stamp, 0); /* * If an event were to come in now, it would see that the * write_stamp and the before_stamp are different, and assume * that this event just added itself before updating * the write stamp. The interrupting event will fix the * write stamp for us, and use an absolute timestamp. */ /* * This is on the tail page. It is possible that * a write could come in and move the tail page * and write to the next page. That is fine * because we just shorten what is on this page. */ old_index += write_mask; new_index += write_mask; /* caution: old_index gets updated on cmpxchg failure */ if (local_try_cmpxchg(&bpage->write, &old_index, new_index)) { /* update counters */ local_sub(event_length, &cpu_buffer->entries_bytes); return true; } } /* could not discard */ return false; } static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) { local_inc(&cpu_buffer->committing); local_inc(&cpu_buffer->commits); } static __always_inline void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long max_count; /* * We only race with interrupts and NMIs on this CPU. * If we own the commit event, then we can commit * all others that interrupted us, since the interruptions * are in stack format (they finish before they come * back to us). This allows us to do a simple loop to * assign the commit to the tail. */ again: max_count = cpu_buffer->nr_pages * 100; while (cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)) { if (RB_WARN_ON(cpu_buffer, !(--max_count))) return; if (RB_WARN_ON(cpu_buffer, rb_is_reader_page(cpu_buffer->tail_page))) return; /* * No need for a memory barrier here, as the update * of the tail_page did it for this page. */ local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); rb_inc_page(&cpu_buffer->commit_page); if (cpu_buffer->ring_meta) { struct ring_buffer_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page; } /* add barrier to keep gcc from optimizing too much */ barrier(); } while (rb_commit_index(cpu_buffer) != rb_page_write(cpu_buffer->commit_page)) { /* Make sure the readers see the content of what is committed. */ smp_wmb(); local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->commit_page->page->commit) & ~RB_WRITE_MASK); barrier(); } /* again, keep gcc from optimizing */ barrier(); /* * If an interrupt came in just after the first while loop * and pushed the tail page forward, we will be left with * a dangling commit that will never go forward. */ if (unlikely(cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page))) goto again; } static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; if (RB_WARN_ON(cpu_buffer, !local_read(&cpu_buffer->committing))) return; again: commits = local_read(&cpu_buffer->commits); /* synchronize with interrupts */ barrier(); if (local_read(&cpu_buffer->committing) == 1) rb_set_commit_to_write(cpu_buffer); local_dec(&cpu_buffer->committing); /* synchronize with interrupts */ barrier(); /* * Need to account for interrupts coming in between the * updating of the commit page and the clearing of the * committing counter. */ if (unlikely(local_read(&cpu_buffer->commits) != commits) && !local_read(&cpu_buffer->committing)) { local_inc(&cpu_buffer->committing); goto again; } } static inline void rb_event_discard(struct ring_buffer_event *event) { if (extended_time(event)) event = skip_time_extend(event); /* array[0] holds the actual length for the discarded event */ event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ if (!event->time_delta) event->time_delta = 1; } static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer) { local_inc(&cpu_buffer->entries); rb_end_commit(cpu_buffer); } static __always_inline void rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) { if (buffer->irq_work.waiters_pending) { buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ irq_work_queue(&buffer->irq_work.work); } if (cpu_buffer->irq_work.waiters_pending) { cpu_buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ irq_work_queue(&cpu_buffer->irq_work.work); } if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched)) return; if (cpu_buffer->reader_page == cpu_buffer->commit_page) return; if (!cpu_buffer->irq_work.full_waiters_pending) return; cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched); if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full)) return; cpu_buffer->irq_work.wakeup_full = true; cpu_buffer->irq_work.full_waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ irq_work_queue(&cpu_buffer->irq_work.work); } #ifdef CONFIG_RING_BUFFER_RECORD_RECURSION # define do_ring_buffer_record_recursion() \ do_ftrace_record_recursion(_THIS_IP_, _RET_IP_) #else # define do_ring_buffer_record_recursion() do { } while (0) #endif /* * The lock and unlock are done within a preempt disable section. * The current_context per_cpu variable can only be modified * by the current task between lock and unlock. But it can * be modified more than once via an interrupt. To pass this * information from the lock to the unlock without having to * access the 'in_interrupt()' functions again (which do show * a bit of overhead in something as critical as function tracing, * we use a bitmask trick. * * bit 1 = NMI context * bit 2 = IRQ context * bit 3 = SoftIRQ context * bit 4 = normal context. * * This works because this is the order of contexts that can * preempt other contexts. A SoftIRQ never preempts an IRQ * context. * * When the context is determined, the corresponding bit is * checked and set (if it was set, then a recursion of that context * happened). * * On unlock, we need to clear this bit. To do so, just subtract * 1 from the current_context and AND it to itself. * * (binary) * 101 - 1 = 100 * 101 & 100 = 100 (clearing bit zero) * * 1010 - 1 = 1001 * 1010 & 1001 = 1000 (clearing bit 1) * * The least significant bit can be cleared this way, and it * just so happens that it is the same bit corresponding to * the current context. * * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit * is set when a recursion is detected at the current context, and if * the TRANSITION bit is already set, it will fail the recursion. * This is needed because there's a lag between the changing of * interrupt context and updating the preempt count. In this case, * a false positive will be found. To handle this, one extra recursion * is allowed, and this is done by the TRANSITION bit. If the TRANSITION * bit is already set, then it is considered a recursion and the function * ends. Otherwise, the TRANSITION bit is set, and that bit is returned. * * On the trace_recursive_unlock(), the TRANSITION bit will be the first * to be cleared. Even if it wasn't the context that set it. That is, * if an interrupt comes in while NORMAL bit is set and the ring buffer * is called before preempt_count() is updated, since the check will * be on the NORMAL bit, the TRANSITION bit will then be set. If an * NMI then comes in, it will set the NMI bit, but when the NMI code * does the trace_recursive_unlock() it will clear the TRANSITION bit * and leave the NMI bit set. But this is fine, because the interrupt * code that set the TRANSITION bit will then clear the NMI bit when it * calls trace_recursive_unlock(). If another NMI comes in, it will * set the TRANSITION bit and continue. * * Note: The TRANSITION bit only handles a single transition between context. */ static __always_inline bool trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { unsigned int val = cpu_buffer->current_context; int bit = interrupt_context_level(); bit = RB_CTX_NORMAL - bit; if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) { /* * It is possible that this was called by transitioning * between interrupt context, and preempt_count() has not * been updated yet. In this case, use the TRANSITION bit. */ bit = RB_CTX_TRANSITION; if (val & (1 << (bit + cpu_buffer->nest))) { do_ring_buffer_record_recursion(); return true; } } val |= (1 << (bit + cpu_buffer->nest)); cpu_buffer->current_context = val; return false; } static __always_inline void trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->current_context &= cpu_buffer->current_context - (1 << cpu_buffer->nest); } /* The recursive locking above uses 5 bits */ #define NESTED_BITS 5 /** * ring_buffer_nest_start - Allow to trace while nested * @buffer: The ring buffer to modify * * The ring buffer has a safety mechanism to prevent recursion. * But there may be a case where a trace needs to be done while * tracing something else. In this case, calling this function * will allow this function to nest within a currently active * ring_buffer_lock_reserve(). * * Call this function before calling another ring_buffer_lock_reserve() and * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit(). */ void ring_buffer_nest_start(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; /* Enabled by ring_buffer_nest_end() */ preempt_disable_notrace(); cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; /* This is the shift value for the above recursive locking */ cpu_buffer->nest += NESTED_BITS; } /** * ring_buffer_nest_end - Allow to trace while nested * @buffer: The ring buffer to modify * * Must be called after ring_buffer_nest_start() and after the * ring_buffer_unlock_commit(). */ void ring_buffer_nest_end(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; /* disabled by ring_buffer_nest_start() */ cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; /* This is the shift value for the above recursive locking */ cpu_buffer->nest -= NESTED_BITS; preempt_enable_notrace(); } /** * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to * * This commits the data to the ring buffer, and releases any locks held. * * Must be paired with ring_buffer_lock_reserve. */ int ring_buffer_unlock_commit(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; rb_commit(cpu_buffer); rb_wakeups(buffer, cpu_buffer); trace_recursive_unlock(cpu_buffer); preempt_enable_notrace(); return 0; } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); /* Special value to validate all deltas on a page. */ #define CHECK_FULL_PAGE 1L #ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS static const char *show_irq_str(int bits) { const char *type[] = { ".", // 0 "s", // 1 "h", // 2 "Hs", // 3 "n", // 4 "Ns", // 5 "Nh", // 6 "NHs", // 7 }; return type[bits]; } /* Assume this is a trace event */ static const char *show_flags(struct ring_buffer_event *event) { struct trace_entry *entry; int bits = 0; if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) return "X"; entry = ring_buffer_event_data(event); if (entry->flags & TRACE_FLAG_SOFTIRQ) bits |= 1; if (entry->flags & TRACE_FLAG_HARDIRQ) bits |= 2; if (entry->flags & TRACE_FLAG_NMI) bits |= 4; return show_irq_str(bits); } static const char *show_irq(struct ring_buffer_event *event) { struct trace_entry *entry; if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) return ""; entry = ring_buffer_event_data(event); if (entry->flags & TRACE_FLAG_IRQS_OFF) return "d"; return ""; } static const char *show_interrupt_level(void) { unsigned long pc = preempt_count(); unsigned char level = 0; if (pc & SOFTIRQ_OFFSET) level |= 1; if (pc & HARDIRQ_MASK) level |= 2; if (pc & NMI_MASK) level |= 4; return show_irq_str(level); } static void dump_buffer_page(struct buffer_data_page *bpage, struct rb_event_info *info, unsigned long tail) { struct ring_buffer_event *event; u64 ts, delta; int e; ts = bpage->time_stamp; pr_warn(" [%lld] PAGE TIME STAMP\n", ts); for (e = 0; e < tail; e += rb_event_length(event)) { event = (struct ring_buffer_event *)(bpage->data + e); switch (event->type_len) { case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); ts += delta; pr_warn(" 0x%x: [%lld] delta:%lld TIME EXTEND\n", e, ts, delta); break; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); ts = rb_fix_abs_ts(delta, ts); pr_warn(" 0x%x: [%lld] absolute:%lld TIME STAMP\n", e, ts, delta); break; case RINGBUF_TYPE_PADDING: ts += event->time_delta; pr_warn(" 0x%x: [%lld] delta:%d PADDING\n", e, ts, event->time_delta); break; case RINGBUF_TYPE_DATA: ts += event->time_delta; pr_warn(" 0x%x: [%lld] delta:%d %s%s\n", e, ts, event->time_delta, show_flags(event), show_irq(event)); break; default: break; } } pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e); } static DEFINE_PER_CPU(atomic_t, checking); static atomic_t ts_dump; #define buffer_warn_return(fmt, ...) \ do { \ /* If another report is happening, ignore this one */ \ if (atomic_inc_return(&ts_dump) != 1) { \ atomic_dec(&ts_dump); \ goto out; \ } \ atomic_inc(&cpu_buffer->record_disabled); \ pr_warn(fmt, ##__VA_ARGS__); \ dump_buffer_page(bpage, info, tail); \ atomic_dec(&ts_dump); \ /* There's some cases in boot up that this can happen */ \ if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING)) \ /* Do not re-enable checking */ \ return; \ } while (0) /* * Check if the current event time stamp matches the deltas on * the buffer page. */ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info, unsigned long tail) { struct buffer_data_page *bpage; u64 ts, delta; bool full = false; int ret; bpage = info->tail_page->page; if (tail == CHECK_FULL_PAGE) { full = true; tail = local_read(&bpage->commit); } else if (info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)) { /* Ignore events with absolute time stamps */ return; } /* * Do not check the first event (skip possible extends too). * Also do not check if previous events have not been committed. */ if (tail <= 8 || tail > local_read(&bpage->commit)) return; /* * If this interrupted another event, */ if (atomic_inc_return(this_cpu_ptr(&checking)) != 1) goto out; ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta); if (ret < 0) { if (delta < ts) { buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n", cpu_buffer->cpu, ts, delta); goto out; } } if ((full && ts > info->ts) || (!full && ts + info->delta != info->ts)) { buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n", cpu_buffer->cpu, ts + info->delta, info->ts, info->delta, info->before, info->after, full ? " (full)" : "", show_interrupt_level()); } out: atomic_dec(this_cpu_ptr(&checking)); } #else static inline void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info, unsigned long tail) { } #endif /* CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS */ static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) { struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write, w; /* Don't let the compiler play games with cpu_buffer->tail_page */ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; barrier(); rb_time_read(&cpu_buffer->before_stamp, &info->before); rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); info->ts = rb_time_stamp(cpu_buffer->buffer); if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) { info->delta = info->ts; } else { /* * If interrupting an event time update, we may need an * absolute timestamp. * Don't bother if this is the start of a new page (w == 0). */ if (!w) { /* Use the sub-buffer timestamp */ info->delta = 0; } else if (unlikely(info->before != info->after)) { info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; info->length += RB_LEN_TIME_EXTEND; } else { info->delta = info->ts - info->after; if (unlikely(test_time_stamp(info->delta))) { info->add_timestamp |= RB_ADD_STAMP_EXTEND; info->length += RB_LEN_TIME_EXTEND; } } } /*B*/ rb_time_set(&cpu_buffer->before_stamp, info->ts); /*C*/ write = local_add_return(info->length, &tail_page->write); /* set write to only the index of the write */ write &= RB_WRITE_MASK; tail = write - info->length; /* See if we shot pass the end of this buffer page */ if (unlikely(write > cpu_buffer->buffer->subbuf_size)) { check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); return rb_move_tail(cpu_buffer, tail, info); } if (likely(tail == w)) { /* Nothing interrupted us between A and C */ /*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts); /* * If something came in between C and D, the write stamp * may now not be in sync. But that's fine as the before_stamp * will be different and then next event will just be forced * to use an absolute timestamp. */ if (likely(!(info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) /* This did not interrupt any time update */ info->delta = info->ts - info->after; else /* Just use full timestamp for interrupting event */ info->delta = info->ts; check_buffer(cpu_buffer, info, tail); } else { u64 ts; /* SLOW PATH - Interrupted between A and C */ /* Save the old before_stamp */ rb_time_read(&cpu_buffer->before_stamp, &info->before); /* * Read a new timestamp and update the before_stamp to make * the next event after this one force using an absolute * timestamp. This is in case an interrupt were to come in * between E and F. */ ts = rb_time_stamp(cpu_buffer->buffer); rb_time_set(&cpu_buffer->before_stamp, ts); barrier(); /*E*/ rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && info->after == info->before && info->after < ts) { /* * Nothing came after this event between C and F, it is * safe to use info->after for the delta as it * matched info->before and is still valid. */ info->delta = ts - info->after; } else { /* * Interrupted between C and F: * Lost the previous events time stamp. Just set the * delta to zero, and this will be the same time as * the event this event interrupted. And the events that * came after this will still be correct (as they would * have built their delta on the previous event. */ info->delta = 0; } info->ts = ts; info->add_timestamp &= ~RB_ADD_STAMP_FORCE; } /* * If this is the first commit on the page, then it has the same * timestamp as the page itself. */ if (unlikely(!tail && !(info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) info->delta = 0; /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); rb_update_event(cpu_buffer, event, info); local_inc(&tail_page->entries); /* * If this is the first commit on the page, then update * its timestamp. */ if (unlikely(!tail)) tail_page->page->time_stamp = info->ts; /* account for these added bytes */ local_add(info->length, &cpu_buffer->entries_bytes); return event; } static __always_inline struct ring_buffer_event * rb_reserve_next_event(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) { struct ring_buffer_event *event; struct rb_event_info info; int nr_loops = 0; int add_ts_default; /* ring buffer does cmpxchg, make sure it is safe in NMI context */ if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && (unlikely(in_nmi()))) { return NULL; } rb_start_commit(cpu_buffer); /* The commit page can not change after this */ #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /* * Due to the ability to swap a cpu buffer from a buffer * it is possible it was swapped before we committed. * (committing stops a swap). We check for it here and * if it happened, we have to fail the write. */ barrier(); if (unlikely(READ_ONCE(cpu_buffer->buffer) != buffer)) { local_dec(&cpu_buffer->committing); local_dec(&cpu_buffer->commits); return NULL; } #endif info.length = rb_calculate_event_length(length); if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { add_ts_default = RB_ADD_STAMP_ABSOLUTE; info.length += RB_LEN_TIME_EXTEND; if (info.length > cpu_buffer->buffer->max_data_size) goto out_fail; } else { add_ts_default = RB_ADD_STAMP_NONE; } again: info.add_timestamp = add_ts_default; info.delta = 0; /* * We allow for interrupts to reenter here and do a trace. * If one does, it will cause this original code to loop * back here. Even with heavy interrupts happening, this * should only happen a few times in a row. If this happens * 1000 times in a row, there must be either an interrupt * storm or we have something buggy. * Bail! */ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; event = __rb_reserve_next(cpu_buffer, &info); if (unlikely(PTR_ERR(event) == -EAGAIN)) { if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND)) info.length -= RB_LEN_TIME_EXTEND; goto again; } if (likely(event)) return event; out_fail: rb_end_commit(cpu_buffer); return NULL; } /** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from * @length: the length of the data to reserve (excluding event header) * * Returns a reserved event on the ring buffer to copy directly to. * The user of this interface will need to get the body to write into * and can use the ring_buffer_event_data() interface. * * The length is the length of the data needed, not the event length * which also includes the event header. * * Must be paired with ring_buffer_unlock_commit, unless NULL is returned. * If NULL is returned, then nothing has been allocated or locked. */ struct ring_buffer_event * ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; int cpu; /* If we are tracing schedule, we don't want to recurse */ preempt_disable_notrace(); if (unlikely(atomic_read(&buffer->record_disabled))) goto out; cpu = raw_smp_processor_id(); if (unlikely(!cpumask_test_cpu(cpu, buffer->cpumask))) goto out; cpu_buffer = buffer->buffers[cpu]; if (unlikely(atomic_read(&cpu_buffer->record_disabled))) goto out; if (unlikely(length > buffer->max_data_size)) goto out; if (unlikely(trace_recursive_lock(cpu_buffer))) goto out; event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) goto out_unlock; return event; out_unlock: trace_recursive_unlock(cpu_buffer); out: preempt_enable_notrace(); return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); /* * Decrement the entries to the page that an event is on. * The event does not even need to exist, only the pointer * to the page it is on. This may only be called before the commit * takes place. */ static inline void rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; struct buffer_page *bpage = cpu_buffer->commit_page; struct buffer_page *start; addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1); /* Do the likely case first */ if (likely(bpage->page == (void *)addr)) { local_dec(&bpage->entries); return; } /* * Because the commit page may be on the reader page we * start with the next page and check the end loop there. */ rb_inc_page(&bpage); start = bpage; do { if (bpage->page == (void *)addr) { local_dec(&bpage->entries); return; } rb_inc_page(&bpage); } while (bpage != start); /* commit not part of this buffer?? */ RB_WARN_ON(cpu_buffer, 1); } /** * ring_buffer_discard_commit - discard an event that has not been committed * @buffer: the ring buffer * @event: non committed event to discard * * Sometimes an event that is in the ring buffer needs to be ignored. * This function lets the user discard an event in the ring buffer * and then that event will not be read later. * * This function only works if it is called before the item has been * committed. It will try to free the event from the ring buffer * if another event has not been added behind it. * * If another event has been added behind it, it will set the event * up as discarded, and perform the commit. * * If this function is called, do not call ring_buffer_unlock_commit on * the event. */ void ring_buffer_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; /* The event is discarded regardless */ rb_event_discard(event); cpu = smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; /* * This must only be called if the event has not been * committed yet. Thus we can assume that preemption * is still disabled. */ RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); rb_decrement_entry(cpu_buffer, event); if (rb_try_to_discard(cpu_buffer, event)) goto out; out: rb_end_commit(cpu_buffer); trace_recursive_unlock(cpu_buffer); preempt_enable_notrace(); } EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); /** * ring_buffer_write - write data to the buffer without reserving * @buffer: The ring buffer to write to. * @length: The length of the data being written (excluding the event header) * @data: The data to write to the buffer. * * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as * one function. If you already have the data to write to the buffer, it * may be easier to simply call this function. * * Note, like ring_buffer_lock_reserve, the length is the length of the data * and not the length of the event which would hold the header. */ int ring_buffer_write(struct trace_buffer *buffer, unsigned long length, void *data) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; void *body; int ret = -EBUSY; int cpu; preempt_disable_notrace(); if (atomic_read(&buffer->record_disabled)) goto out; cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; cpu_buffer = buffer->buffers[cpu]; if (atomic_read(&cpu_buffer->record_disabled)) goto out; if (length > buffer->max_data_size) goto out; if (unlikely(trace_recursive_lock(cpu_buffer))) goto out; event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) goto out_unlock; body = rb_event_data(event); memcpy(body, data, length); rb_commit(cpu_buffer); rb_wakeups(buffer, cpu_buffer); ret = 0; out_unlock: trace_recursive_unlock(cpu_buffer); out: preempt_enable_notrace(); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_write); static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = cpu_buffer->reader_page; struct buffer_page *head = rb_set_head_page(cpu_buffer); struct buffer_page *commit = cpu_buffer->commit_page; /* In case of error, head will be NULL */ if (unlikely(!head)) return true; /* Reader should exhaust content in reader page */ if (reader->read != rb_page_size(reader)) return false; /* * If writers are committing on the reader page, knowing all * committed content has been read, the ring buffer is empty. */ if (commit == reader) return true; /* * If writers are committing on a page other than reader page * and head page, there should always be content to read. */ if (commit != head) return false; /* * Writers are committing on the head page, we just need * to care about there're committed data, and the reader will * swap reader page with head page when it is to read data. */ return rb_page_commit(commit) == 0; } /** * ring_buffer_record_disable - stop all writes into the buffer * @buffer: The ring buffer to stop writes to. * * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * * The caller should call synchronize_rcu() after this. */ void ring_buffer_record_disable(struct trace_buffer *buffer) { atomic_inc(&buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_record_disable); /** * ring_buffer_record_enable - enable writes to the buffer * @buffer: The ring buffer to enable writes * * Note, multiple disables will need the same number of enables * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable(struct trace_buffer *buffer) { atomic_dec(&buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_record_enable); /** * ring_buffer_record_off - stop all writes into the buffer * @buffer: The ring buffer to stop writes to. * * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * * This is different than ring_buffer_record_disable() as * it works like an on/off switch, where as the disable() version * must be paired with a enable(). */ void ring_buffer_record_off(struct trace_buffer *buffer) { unsigned int rd; unsigned int new_rd; rd = atomic_read(&buffer->record_disabled); do { new_rd = rd | RB_BUFFER_OFF; } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd)); } EXPORT_SYMBOL_GPL(ring_buffer_record_off); /** * ring_buffer_record_on - restart writes into the buffer * @buffer: The ring buffer to start writes to. * * This enables all writes to the buffer that was disabled by * ring_buffer_record_off(). * * This is different than ring_buffer_record_enable() as * it works like an on/off switch, where as the enable() version * must be paired with a disable(). */ void ring_buffer_record_on(struct trace_buffer *buffer) { unsigned int rd; unsigned int new_rd; rd = atomic_read(&buffer->record_disabled); do { new_rd = rd & ~RB_BUFFER_OFF; } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd)); } EXPORT_SYMBOL_GPL(ring_buffer_record_on); /** * ring_buffer_record_is_on - return true if the ring buffer can write * @buffer: The ring buffer to see if write is enabled * * Returns true if the ring buffer is in a state that it accepts writes. */ bool ring_buffer_record_is_on(struct trace_buffer *buffer) { return !atomic_read(&buffer->record_disabled); } /** * ring_buffer_record_is_set_on - return true if the ring buffer is set writable * @buffer: The ring buffer to see if write is set enabled * * Returns true if the ring buffer is set writable by ring_buffer_record_on(). * Note that this does NOT mean it is in a writable state. * * It may return true when the ring buffer has been disabled by * ring_buffer_record_disable(), as that is a temporary disabling of * the ring buffer. */ bool ring_buffer_record_is_set_on(struct trace_buffer *buffer) { return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF); } /** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. * @cpu: The CPU buffer to stop * * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * * The caller should call synchronize_rcu() after this. */ void ring_buffer_record_disable_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu]; atomic_inc(&cpu_buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); /** * ring_buffer_record_enable_cpu - enable writes to the buffer * @buffer: The ring buffer to enable writes * @cpu: The CPU to enable. * * Note, multiple disables will need the same number of enables * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu]; atomic_dec(&cpu_buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); /* * The total entries in the ring buffer is the running counter * of entries entered into the ring buffer, minus the sum of * the entries read from the ring buffer and the number of * entries that were overwritten. */ static inline unsigned long rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) { return local_read(&cpu_buffer->entries) - (local_read(&cpu_buffer->overrun) + cpu_buffer->read); } /** * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. */ u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu) { unsigned long flags; struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; u64 ret = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* * if the tail is on reader_page, oldest time stamp is on the reader * page */ if (cpu_buffer->tail_page == cpu_buffer->reader_page) bpage = cpu_buffer->reader_page; else bpage = rb_set_head_page(cpu_buffer); if (bpage) ret = bpage->page->time_stamp; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); /** * ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. */ unsigned long ring_buffer_bytes_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes; return ret; } EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu); /** * ring_buffer_entries_cpu - get the number of entries in a cpu buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to get the entries from. */ unsigned long ring_buffer_entries_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; return rb_num_of_entries(cpu_buffer); } EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); /** * ring_buffer_overrun_cpu - get the number of overruns caused by the ring * buffer wrapping around (only if RB_FL_OVERWRITE is on). * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ unsigned long ring_buffer_overrun_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; ret = local_read(&cpu_buffer->overrun); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); /** * ring_buffer_commit_overrun_cpu - get the number of overruns caused by * commits failing due to the buffer wrapping around while there are uncommitted * events, such as during an interrupt storm. * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ unsigned long ring_buffer_commit_overrun_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; ret = local_read(&cpu_buffer->commit_overrun); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); /** * ring_buffer_dropped_events_cpu - get the number of dropped events caused by * the ring buffer filling up (only if RB_FL_OVERWRITE is off). * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ unsigned long ring_buffer_dropped_events_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; ret = local_read(&cpu_buffer->dropped_events); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); /** * ring_buffer_read_events_cpu - get the number of events successfully read * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of events read */ unsigned long ring_buffer_read_events_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; return cpu_buffer->read; } EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu); /** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer * * Returns the total number of entries in the ring buffer * (all CPU entries) */ unsigned long ring_buffer_entries(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long entries = 0; int cpu; /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; entries += rb_num_of_entries(cpu_buffer); } return entries; } EXPORT_SYMBOL_GPL(ring_buffer_entries); /** * ring_buffer_overruns - get the number of overruns in buffer * @buffer: The ring buffer * * Returns the total number of overruns in the ring buffer * (all CPU entries) */ unsigned long ring_buffer_overruns(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long overruns = 0; int cpu; /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; overruns += local_read(&cpu_buffer->overrun); } return overruns; } EXPORT_SYMBOL_GPL(ring_buffer_overruns); static void rb_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; /* Iterator usage is expected to have record disabled */ iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; iter->next_event = iter->head; iter->cache_reader_page = iter->head_page; iter->cache_read = cpu_buffer->read; iter->cache_pages_removed = cpu_buffer->pages_removed; if (iter->head) { iter->read_stamp = cpu_buffer->read_stamp; iter->page_stamp = cpu_buffer->reader_page->page->time_stamp; } else { iter->read_stamp = iter->head_page->page->time_stamp; iter->page_stamp = iter->read_stamp; } } /** * ring_buffer_iter_reset - reset an iterator * @iter: The iterator to reset * * Resets the iterator, so that it will start from the beginning * again. */ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; if (!iter) return; cpu_buffer = iter->cpu_buffer; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_iter_reset(iter); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); /** * ring_buffer_iter_empty - check if an iterator has no more to read * @iter: The iterator to check */ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *reader; struct buffer_page *head_page; struct buffer_page *commit_page; struct buffer_page *curr_commit_page; unsigned commit; u64 curr_commit_ts; u64 commit_ts; cpu_buffer = iter->cpu_buffer; reader = cpu_buffer->reader_page; head_page = cpu_buffer->head_page; commit_page = READ_ONCE(cpu_buffer->commit_page); commit_ts = commit_page->page->time_stamp; /* * When the writer goes across pages, it issues a cmpxchg which * is a mb(), which will synchronize with the rmb here. * (see rb_tail_page_update()) */ smp_rmb(); commit = rb_page_commit(commit_page); /* We want to make sure that the commit page doesn't change */ smp_rmb(); /* Make sure commit page didn't change */ curr_commit_page = READ_ONCE(cpu_buffer->commit_page); curr_commit_ts = READ_ONCE(curr_commit_page->page->time_stamp); /* If the commit page changed, then there's more data */ if (curr_commit_page != commit_page || curr_commit_ts != commit_ts) return 0; /* Still racy, as it may return a false positive, but that's OK */ return ((iter->head_page == commit_page && iter->head >= commit) || (iter->head_page == reader && commit_page == head_page && head_page->read == commit && iter->head == rb_page_size(cpu_buffer->reader_page))); } EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); static void rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { u64 delta; switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); cpu_buffer->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); delta = rb_fix_abs_ts(delta, cpu_buffer->read_stamp); cpu_buffer->read_stamp = delta; return; case RINGBUF_TYPE_DATA: cpu_buffer->read_stamp += event->time_delta; return; default: RB_WARN_ON(cpu_buffer, 1); } } static void rb_update_iter_read_stamp(struct ring_buffer_iter *iter, struct ring_buffer_event *event) { u64 delta; switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); iter->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); delta = rb_fix_abs_ts(delta, iter->read_stamp); iter->read_stamp = delta; return; case RINGBUF_TYPE_DATA: iter->read_stamp += event->time_delta; return; default: RB_WARN_ON(iter->cpu_buffer, 1); } } static struct buffer_page * rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); unsigned long overwrite; unsigned long flags; int nr_loops = 0; bool ret; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); again: /* * This should normally only loop twice. But because the * start of the reader inserts an empty page, it causes * a case where we will loop three times. There should be no * reason to loop four times (that I know of). */ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) { reader = NULL; goto out; } reader = cpu_buffer->reader_page; /* If there's more to read, return this page */ if (cpu_buffer->reader_page->read < rb_page_size(reader)) goto out; /* Never should we have an index greater than the size */ if (RB_WARN_ON(cpu_buffer, cpu_buffer->reader_page->read > rb_page_size(reader))) goto out; /* check if we caught up to the tail */ reader = NULL; if (cpu_buffer->commit_page == cpu_buffer->reader_page) goto out; /* Don't bother swapping if the ring buffer is empty */ if (rb_num_of_entries(cpu_buffer) == 0) goto out; /* * Reset the reader page to size zero. */ local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->real_end = 0; spin: /* * Splice the empty reader page into the list around the head. */ reader = rb_set_head_page(cpu_buffer); if (!reader) goto out; cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); cpu_buffer->reader_page->list.prev = reader->list.prev; /* * cpu_buffer->pages just needs to point to the buffer, it * has no specific buffer page to point to. Lets move it out * of our way so we don't accidentally swap it. */ cpu_buffer->pages = reader->list.prev; /* The reader page will be pointing to the new head */ rb_set_list_to_head(&cpu_buffer->reader_page->list); /* * We want to make sure we read the overruns after we set up our * pointers to the next object. The writer side does a * cmpxchg to cross pages which acts as the mb on the writer * side. Note, the reader will constantly fail the swap * while the writer is updating the pointers, so this * guarantees that the overwrite recorded here is the one we * want to compare with the last_overrun. */ smp_mb(); overwrite = local_read(&(cpu_buffer->overrun)); /* * Here's the tricky part. * * We need to move the pointer past the header page. * But we can only do that if a writer is not currently * moving it. The page before the header page has the * flag bit '1' set if it is pointing to the page we want. * but if the writer is in the process of moving it * than it will be '2' or already moved '0'. */ ret = rb_head_page_replace(reader, cpu_buffer->reader_page); /* * If we did not convert it, then we must try again. */ if (!ret) goto spin; if (cpu_buffer->ring_meta) rb_update_meta_reader(cpu_buffer, reader); /* * Yay! We succeeded in replacing the page. * * Now make the new head point back to the reader page. */ rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; rb_inc_page(&cpu_buffer->head_page); cpu_buffer->cnt++; local_inc(&cpu_buffer->pages_read); /* Finally update the reader page to the new head */ cpu_buffer->reader_page = reader; cpu_buffer->reader_page->read = 0; if (overwrite != cpu_buffer->last_overrun) { cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; cpu_buffer->last_overrun = overwrite; } goto again; out: /* Update the read_stamp on the first event */ if (reader && reader->read == 0) cpu_buffer->read_stamp = reader->page->time_stamp; arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); /* * The writer has preempt disable, wait for it. But not forever * Although, 1 second is pretty much "forever" */ #define USECS_WAIT 1000000 for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) { /* If the write is past the end of page, a writer is still updating it */ if (likely(!reader || rb_page_write(reader) <= bsize)) break; udelay(1); /* Get the latest version of the reader write value */ smp_rmb(); } /* The writer is not moving forward? Something is wrong */ if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT)) reader = NULL; /* * Make sure we see any padding after the write update * (see rb_reset_tail()). * * In addition, a writer may be writing on the reader page * if the page has not been fully filled, so the read barrier * is also needed to make sure we see the content of what is * committed by the writer (see rb_set_commit_to_write()). */ smp_rmb(); return reader; } static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_event *event; struct buffer_page *reader; unsigned length; reader = rb_get_reader_page(cpu_buffer); /* This function should not be called when buffer is empty */ if (RB_WARN_ON(cpu_buffer, !reader)) return; event = rb_reader_event(cpu_buffer); if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX) cpu_buffer->read++; rb_update_read_stamp(cpu_buffer, event); length = rb_event_length(event); cpu_buffer->reader_page->read += length; cpu_buffer->read_bytes += length; } static void rb_advance_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; cpu_buffer = iter->cpu_buffer; /* If head == next_event then we need to jump to the next event */ if (iter->head == iter->next_event) { /* If the event gets overwritten again, there's nothing to do */ if (rb_iter_head_event(iter) == NULL) return; } iter->head = iter->next_event; /* * Check if we are at the end of the buffer. */ if (iter->next_event >= rb_page_size(iter->head_page)) { /* discarded commits can make the page empty */ if (iter->head_page == cpu_buffer->commit_page) return; rb_inc_iter(iter); return; } rb_update_iter_read_stamp(iter, iter->event); } static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) { return cpu_buffer->lost_events; } static struct ring_buffer_event * rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, unsigned long *lost_events) { struct ring_buffer_event *event; struct buffer_page *reader; int nr_loops = 0; if (ts) *ts = 0; again: /* * We repeat when a time extend is encountered. * Since the time extend is always attached to a data event, * we should never loop more than once. * (We never hit the following condition more than twice). */ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) return NULL; reader = rb_get_reader_page(cpu_buffer); if (!reader) return NULL; event = rb_reader_event(cpu_buffer); switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) RB_WARN_ON(cpu_buffer, 1); /* * Because the writer could be discarding every * event it creates (which would probably be bad) * if we were to go back to "again" then we may never * catch up, and will trigger the warn on, or lock * the box. Return the padding, and we will release * the current locks, and try again. */ return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ rb_advance_reader(cpu_buffer); goto again; case RINGBUF_TYPE_TIME_STAMP: if (ts) { *ts = rb_event_time_stamp(event); *ts = rb_fix_abs_ts(*ts, reader->page->time_stamp); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } /* Internal data, OK to advance */ rb_advance_reader(cpu_buffer); goto again; case RINGBUF_TYPE_DATA: if (ts && !(*ts)) { *ts = cpu_buffer->read_stamp + event->time_delta; ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } if (lost_events) *lost_events = rb_lost_events(cpu_buffer); return event; default: RB_WARN_ON(cpu_buffer, 1); } return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_peek); static struct ring_buffer_event * rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) { struct trace_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; int nr_loops = 0; if (ts) *ts = 0; cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; /* * Check if someone performed a consuming read to the buffer * or removed some pages from the buffer. In these cases, * iterator was invalidated and we need to reset it. */ if (unlikely(iter->cache_read != cpu_buffer->read || iter->cache_reader_page != cpu_buffer->reader_page || iter->cache_pages_removed != cpu_buffer->pages_removed)) rb_iter_reset(iter); again: if (ring_buffer_iter_empty(iter)) return NULL; /* * As the writer can mess with what the iterator is trying * to read, just give up if we fail to get an event after * three tries. The iterator is not as reliable when reading * the ring buffer with an active write as the consumer is. * Do not warn if the three failures is reached. */ if (++nr_loops > 3) return NULL; if (rb_per_cpu_empty(cpu_buffer)) return NULL; if (iter->head >= rb_page_size(iter->head_page)) { rb_inc_iter(iter); goto again; } event = rb_iter_head_event(iter); if (!event) goto again; switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) { rb_inc_iter(iter); goto again; } rb_advance_iter(iter); return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ rb_advance_iter(iter); goto again; case RINGBUF_TYPE_TIME_STAMP: if (ts) { *ts = rb_event_time_stamp(event); *ts = rb_fix_abs_ts(*ts, iter->head_page->page->time_stamp); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } /* Internal data, OK to advance */ rb_advance_iter(iter); goto again; case RINGBUF_TYPE_DATA: if (ts && !(*ts)) { *ts = iter->read_stamp + event->time_delta; ring_buffer_normalize_time_stamp(buffer, cpu_buffer->cpu, ts); } return event; default: RB_WARN_ON(cpu_buffer, 1); } return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); static inline bool rb_reader_lock(struct ring_buffer_per_cpu *cpu_buffer) { if (likely(!in_nmi())) { raw_spin_lock(&cpu_buffer->reader_lock); return true; } /* * If an NMI die dumps out the content of the ring buffer * trylock must be used to prevent a deadlock if the NMI * preempted a task that holds the ring buffer locks. If * we get the lock then all is fine, if not, then continue * to do the read, but this can corrupt the ring buffer, * so it must be permanently disabled from future writes. * Reading from NMI is a oneshot deal. */ if (raw_spin_trylock(&cpu_buffer->reader_lock)) return true; /* Continue without locking, but disable the ring buffer */ atomic_inc(&cpu_buffer->record_disabled); return false; } static inline void rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked) { if (likely(locked)) raw_spin_unlock(&cpu_buffer->reader_lock); } /** * ring_buffer_peek - peek at the next event to be read * @buffer: The ring buffer to read * @cpu: The cpu to peak at * @ts: The timestamp counter of this event. * @lost_events: a variable to store if events were lost (may be NULL) * * This will return the event that will be read next, but does * not consume the data. */ struct ring_buffer_event * ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; unsigned long flags; bool dolock; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; again: local_irq_save(flags); dolock = rb_reader_lock(cpu_buffer); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; return event; } /** ring_buffer_iter_dropped - report if there are dropped events * @iter: The ring buffer iterator * * Returns true if there was dropped events since the last peek. */ bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter) { bool ret = iter->missed_events != 0; iter->missed_events = 0; return ret; } EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped); /** * ring_buffer_iter_peek - peek at the next event to be read * @iter: The ring buffer iterator * @ts: The timestamp counter of this event. * * This will return the event that will be read next, but does * not increment the iterator. */ struct ring_buffer_event * ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; struct ring_buffer_event *event; unsigned long flags; again: raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; return event; } /** * ring_buffer_consume - return an event and consume it * @buffer: The ring buffer to get the next event from * @cpu: the cpu to read the buffer from * @ts: a variable to store the timestamp (may be NULL) * @lost_events: a variable to store if events were lost (may be NULL) * * Returns the next event in the ring buffer, and that event is consumed. * Meaning, that sequential reads will keep returning a different event, * and eventually empty the ring buffer if the producer is slower. */ struct ring_buffer_event * ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event = NULL; unsigned long flags; bool dolock; again: /* might be called in atomic */ preempt_disable(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); dolock = rb_reader_lock(cpu_buffer); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event) { cpu_buffer->lost_events = 0; rb_advance_reader(cpu_buffer); } rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); out: preempt_enable(); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; return event; } EXPORT_SYMBOL_GPL(ring_buffer_consume); /** * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over * @flags: gfp flags to use for memory allocation * * This performs the initial preparations necessary to iterate * through the buffer. Memory is allocated, buffer resizing * is disabled, and the iterator pointer is returned to the caller. * * After a sequence of ring_buffer_read_prepare calls, the user is * expected to make at least one call to ring_buffer_read_prepare_sync. * Afterwards, ring_buffer_read_start is invoked to get things going * for real. * * This overall must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; iter = kzalloc(sizeof(*iter), flags); if (!iter) return NULL; /* Holds the entire event: data and meta data */ iter->event_size = buffer->subbuf_size; iter->event = kmalloc(iter->event_size, flags); if (!iter->event) { kfree(iter); return NULL; } cpu_buffer = buffer->buffers[cpu]; iter->cpu_buffer = cpu_buffer; atomic_inc(&cpu_buffer->resize_disabled); return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); /** * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls * * All previously invoked ring_buffer_read_prepare calls to prepare * iterators will be synchronized. Afterwards, read_buffer_read_start * calls on those iterators are allowed. */ void ring_buffer_read_prepare_sync(void) { synchronize_rcu(); } EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); /** * ring_buffer_read_start - start a non consuming read of the buffer * @iter: The iterator returned by ring_buffer_read_prepare * * This finalizes the startup of an iteration through the buffer. * The iterator comes from a call to ring_buffer_read_prepare and * an intervening ring_buffer_read_prepare_sync must have been * performed. * * Must be paired with ring_buffer_read_finish. */ void ring_buffer_read_start(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; if (!iter) return; cpu_buffer = iter->cpu_buffer; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_read_start); /** * ring_buffer_read_finish - finish reading the iterator of the buffer * @iter: The iterator retrieved by ring_buffer_start * * This re-enables resizing of the buffer, and frees the iterator. */ void ring_buffer_read_finish(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; /* Use this opportunity to check the integrity of the ring buffer. */ rb_check_pages(cpu_buffer); atomic_dec(&cpu_buffer->resize_disabled); kfree(iter->event); kfree(iter); } EXPORT_SYMBOL_GPL(ring_buffer_read_finish); /** * ring_buffer_iter_advance - advance the iterator to the next location * @iter: The ring buffer iterator * * Move the location of the iterator such that the next read will * be the next location of the iterator. */ void ring_buffer_iter_advance(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_advance_iter(iter); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_iter_advance); /** * ring_buffer_size - return the size of the ring buffer (in bytes) * @buffer: The ring buffer. * @cpu: The CPU to get ring buffer size from. */ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu) { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages; } EXPORT_SYMBOL_GPL(ring_buffer_size); /** * ring_buffer_max_event_size - return the max data size of an event * @buffer: The ring buffer. * * Returns the maximum size an event can be. */ unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer) { /* If abs timestamp is requested, events have a timestamp too */ if (ring_buffer_time_stamp_abs(buffer)) return buffer->max_data_size - RB_LEN_TIME_EXTEND; return buffer->max_data_size; } EXPORT_SYMBOL_GPL(ring_buffer_max_event_size); static void rb_clear_buffer_page(struct buffer_page *page) { local_set(&page->write, 0); local_set(&page->entries, 0); rb_init_page(page->page); page->read = 0; } static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; if (!meta) return; meta->reader.read = cpu_buffer->reader_page->read; meta->reader.id = cpu_buffer->reader_page->id; meta->reader.lost_events = cpu_buffer->lost_events; meta->entries = local_read(&cpu_buffer->entries); meta->overrun = local_read(&cpu_buffer->overrun); meta->read = cpu_buffer->read; /* Some archs do not have data cache coherency between kernel and user-space */ flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); } static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *page; rb_head_page_deactivate(cpu_buffer); cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); rb_clear_buffer_page(cpu_buffer->head_page); list_for_each_entry(page, cpu_buffer->pages, list) { rb_clear_buffer_page(page); } cpu_buffer->tail_page = cpu_buffer->head_page; cpu_buffer->commit_page = cpu_buffer->head_page; INIT_LIST_HEAD(&cpu_buffer->reader_page->list); INIT_LIST_HEAD(&cpu_buffer->new_pages); rb_clear_buffer_page(cpu_buffer->reader_page); local_set(&cpu_buffer->entries_bytes, 0); local_set(&cpu_buffer->overrun, 0); local_set(&cpu_buffer->commit_overrun, 0); local_set(&cpu_buffer->dropped_events, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); local_set(&cpu_buffer->pages_touched, 0); local_set(&cpu_buffer->pages_lost, 0); local_set(&cpu_buffer->pages_read, 0); cpu_buffer->last_pages_touch = 0; cpu_buffer->shortest_full = 0; cpu_buffer->read = 0; cpu_buffer->read_bytes = 0; rb_time_set(&cpu_buffer->write_stamp, 0); rb_time_set(&cpu_buffer->before_stamp, 0); memset(cpu_buffer->event_stamp, 0, sizeof(cpu_buffer->event_stamp)); cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; rb_head_page_activate(cpu_buffer); cpu_buffer->pages_removed = 0; if (cpu_buffer->mapped) { rb_update_meta_page(cpu_buffer); if (cpu_buffer->ring_meta) { struct ring_buffer_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = meta->head_buffer; } } } /* Must have disabled the cpu buffer then done a synchronize_rcu */ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long flags; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) goto out; arch_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); arch_spin_unlock(&cpu_buffer->lock); out: raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of * @cpu: The CPU buffer to be reset */ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_meta *meta; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); /* Make sure all commits have finished */ synchronize_rcu(); reset_disabled_cpu_buffer(cpu_buffer); atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); /* Make sure persistent meta now uses this buffer's addresses */ meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); if (meta) rb_meta_init_text_addr(meta); mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); /* Flag to ensure proper resetting of atomic variables */ #define RESET_BIT (1 << 30) /** * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of */ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_meta *meta; int cpu; /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); for_each_online_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; atomic_add(RESET_BIT, &cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); } /* Make sure all commits have finished */ synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; /* * If a CPU came online during the synchronize_rcu(), then * ignore it. */ if (!(atomic_read(&cpu_buffer->resize_disabled) & RESET_BIT)) continue; reset_disabled_cpu_buffer(cpu_buffer); /* Make sure persistent meta now uses this buffer's addresses */ meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); if (meta) rb_meta_init_text_addr(meta); atomic_dec(&cpu_buffer->record_disabled); atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled); } mutex_unlock(&buffer->mutex); } /** * ring_buffer_reset - reset a ring buffer * @buffer: The ring buffer to reset all cpu buffers */ void ring_buffer_reset(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; atomic_inc(&cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); } /* Make sure all commits have finished */ synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; reset_disabled_cpu_buffer(cpu_buffer); atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); } mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset); /** * ring_buffer_empty - is the ring buffer empty? * @buffer: The ring buffer to test */ bool ring_buffer_empty(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; bool dolock; bool ret; int cpu; /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); dolock = rb_reader_lock(cpu_buffer); ret = rb_per_cpu_empty(cpu_buffer); rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); if (!ret) return false; } return true; } EXPORT_SYMBOL_GPL(ring_buffer_empty); /** * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty? * @buffer: The ring buffer * @cpu: The CPU buffer to test */ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; bool dolock; bool ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return true; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); dolock = rb_reader_lock(cpu_buffer); ret = rb_per_cpu_empty(cpu_buffer); rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /** * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers * @buffer_a: One buffer to swap with * @buffer_b: The other buffer to swap with * @cpu: the CPU of the buffers to swap * * This function is useful for tracers that want to take a "snapshot" * of a CPU buffer and has another back up buffer lying around. * it is expected that the tracer handles the cpu buffer not being * used at the moment. */ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, struct trace_buffer *buffer_b, int cpu) { struct ring_buffer_per_cpu *cpu_buffer_a; struct ring_buffer_per_cpu *cpu_buffer_b; int ret = -EINVAL; if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) goto out; cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; /* It's up to the callers to not try to swap mapped buffers */ if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { ret = -EBUSY; goto out; } /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; if (buffer_a->subbuf_order != buffer_b->subbuf_order) goto out; ret = -EAGAIN; if (atomic_read(&buffer_a->record_disabled)) goto out; if (atomic_read(&buffer_b->record_disabled)) goto out; if (atomic_read(&cpu_buffer_a->record_disabled)) goto out; if (atomic_read(&cpu_buffer_b->record_disabled)) goto out; /* * We can't do a synchronize_rcu here because this * function can be called in atomic context. * Normally this will be called from the same CPU as cpu. * If not it's up to the caller to protect this. */ atomic_inc(&cpu_buffer_a->record_disabled); atomic_inc(&cpu_buffer_b->record_disabled); ret = -EBUSY; if (local_read(&cpu_buffer_a->committing)) goto out_dec; if (local_read(&cpu_buffer_b->committing)) goto out_dec; /* * When resize is in progress, we cannot swap it because * it will mess the state of the cpu buffer. */ if (atomic_read(&buffer_a->resizing)) goto out_dec; if (atomic_read(&buffer_b->resizing)) goto out_dec; buffer_a->buffers[cpu] = cpu_buffer_b; buffer_b->buffers[cpu] = cpu_buffer_a; cpu_buffer_b->buffer = buffer_a; cpu_buffer_a->buffer = buffer_b; ret = 0; out_dec: atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); out: return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); #endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */ /** * ring_buffer_alloc_read_page - allocate a page to read from buffer * @buffer: the buffer to allocate for. * @cpu: the cpu buffer to allocate. * * This function is used in conjunction with ring_buffer_read_page. * When reading a full page from the ring buffer, these functions * can be used to speed up the process. The calling function should * allocate a few pages first with this function. Then when it * needs to get pages from the ring buffer, it passes the result * of this function into ring_buffer_read_page, which will swap * the page that was allocated, with the read page of the buffer. * * Returns: * The page allocated, or ERR_PTR */ struct buffer_data_read_page * ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_read_page *bpage = NULL; unsigned long flags; struct page *page; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return ERR_PTR(-ENODEV); bpage = kzalloc(sizeof(*bpage), GFP_KERNEL); if (!bpage) return ERR_PTR(-ENOMEM); bpage->order = buffer->subbuf_order; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); if (cpu_buffer->free_page) { bpage->data = cpu_buffer->free_page; cpu_buffer->free_page = NULL; } arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); if (bpage->data) goto out; page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) { kfree(bpage); return ERR_PTR(-ENOMEM); } bpage->data = page_address(page); out: rb_init_page(bpage->data); return bpage; } EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); /** * ring_buffer_free_read_page - free an allocated read page * @buffer: the buffer the page was allocate for * @cpu: the cpu buffer the page came from * @data_page: the page to free * * Free a page allocated from ring_buffer_alloc_read_page. */ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, struct buffer_data_read_page *data_page) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_page *bpage = data_page->data; struct page *page = virt_to_page(bpage); unsigned long flags; if (!buffer || !buffer->buffers || !buffer->buffers[cpu]) return; cpu_buffer = buffer->buffers[cpu]; /* * If the page is still in use someplace else, or order of the page * is different from the subbuffer order of the buffer - * we can't reuse it */ if (page_ref_count(page) > 1 || data_page->order != buffer->subbuf_order) goto out; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); if (!cpu_buffer->free_page) { cpu_buffer->free_page = bpage; bpage = NULL; } arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); out: free_pages((unsigned long)bpage, data_page->order); kfree(data_page); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); /** * ring_buffer_read_page - extract a page from the ring buffer * @buffer: buffer to extract from * @data_page: the page to use allocated from ring_buffer_alloc_read_page * @len: amount to extract * @cpu: the cpu of the buffer to extract * @full: should the extraction only happen when the page is full. * * This function will pull out a page from the ring buffer and consume it. * @data_page must be the address of the variable that was returned * from ring_buffer_alloc_read_page. This is because the page might be used * to swap with a page in the ring buffer. * * for example: * rpage = ring_buffer_alloc_read_page(buffer, cpu); * if (IS_ERR(rpage)) * return PTR_ERR(rpage); * ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0); * if (ret >= 0) * process_page(ring_buffer_read_page_data(rpage), ret); * ring_buffer_free_read_page(buffer, cpu, rpage); * * When @full is set, the function will not return true unless * the writer is off the reader page. * * Note: it is up to the calling functions to handle sleeps and wakeups. * The ring buffer can be used anywhere in the kernel and can not * blindly call wake_up. The layer that uses the ring buffer must be * responsible for that. * * Returns: * >=0 if data has been transferred, returns the offset of consumed data. * <0 if no data has been transferred. */ int ring_buffer_read_page(struct trace_buffer *buffer, struct buffer_data_read_page *data_page, size_t len, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; struct buffer_data_page *bpage; struct buffer_page *reader; unsigned long missed_events; unsigned long flags; unsigned int commit; unsigned int read; u64 save_timestamp; int ret = -1; if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; /* * If len is not big enough to hold the page header, then * we can not copy anything. */ if (len <= BUF_PAGE_HDR_SIZE) goto out; len -= BUF_PAGE_HDR_SIZE; if (!data_page || !data_page->data) goto out; if (data_page->order != buffer->subbuf_order) goto out; bpage = data_page->data; if (!bpage) goto out; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); reader = rb_get_reader_page(cpu_buffer); if (!reader) goto out_unlock; event = rb_reader_event(cpu_buffer); read = reader->read; commit = rb_page_size(reader); /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; /* * If this page has been partially read or * if len is not big enough to read the rest of the page or * a writer is still on the page, then * we must copy the data from the page to the buffer. * Otherwise, we can simply swap the page with the one passed in. */ if (read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page || cpu_buffer->mapped) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; unsigned int size; /* * If a full page is expected, this can still be returned * if there's been a previous partial read and the * rest of the page can be read and the commit page is off * the reader page. */ if (full && (!read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page)) goto out_unlock; if (len > (commit - read)) len = (commit - read); /* Always keep the time extend and data together */ size = rb_event_ts_length(event); if (len < size) goto out_unlock; /* save the current timestamp, since the user will need it */ save_timestamp = cpu_buffer->read_stamp; /* Need to copy one event at a time */ do { /* We need the size of one event, because * rb_advance_reader only advances by one event, * whereas rb_event_ts_length may include the size of * one or two events. * We have already ensured there's enough space if this * is a time extend. */ size = rb_event_length(event); memcpy(bpage->data + pos, rpage->data + rpos, size); len -= size; rb_advance_reader(cpu_buffer); rpos = reader->read; pos += size; if (rpos >= commit) break; event = rb_reader_event(cpu_buffer); /* Always keep the time extend and data together */ size = rb_event_ts_length(event); } while (len >= size); /* update bpage */ local_set(&bpage->commit, pos); bpage->time_stamp = save_timestamp; /* we copied everything to the beginning */ read = 0; } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); cpu_buffer->read_bytes += rb_page_size(reader); /* swap the pages */ rb_init_page(bpage); bpage = reader->page; reader->page = data_page->data; local_set(&reader->write, 0); local_set(&reader->entries, 0); reader->read = 0; data_page->data = bpage; /* * Use the real_end for the data size, * This gives us a chance to store the lost events * on the page. */ if (reader->real_end) local_set(&bpage->commit, reader->real_end); } ret = read; cpu_buffer->lost_events = 0; commit = local_read(&bpage->commit); /* * Set a flag in the commit field if we lost events */ if (missed_events) { /* If there is room at the end of the page to save the * missed events, then record it there. */ if (buffer->subbuf_size - commit >= sizeof(missed_events)) { memcpy(&bpage->data[commit], &missed_events, sizeof(missed_events)); local_add(RB_MISSED_STORED, &bpage->commit); commit += sizeof(missed_events); } local_add(RB_MISSED_EVENTS, &bpage->commit); } /* * This page may be off to user land. Zero it out here. */ if (commit < buffer->subbuf_size) memset(&bpage->data[commit], 0, buffer->subbuf_size - commit); out_unlock: raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); out: return ret; } EXPORT_SYMBOL_GPL(ring_buffer_read_page); /** * ring_buffer_read_page_data - get pointer to the data in the page. * @page: the page to get the data from * * Returns pointer to the actual data in this page. */ void *ring_buffer_read_page_data(struct buffer_data_read_page *page) { return page->data; } EXPORT_SYMBOL_GPL(ring_buffer_read_page_data); /** * ring_buffer_subbuf_size_get - get size of the sub buffer. * @buffer: the buffer to get the sub buffer size from * * Returns size of the sub buffer, in bytes. */ int ring_buffer_subbuf_size_get(struct trace_buffer *buffer) { return buffer->subbuf_size + BUF_PAGE_HDR_SIZE; } EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get); /** * ring_buffer_subbuf_order_get - get order of system sub pages in one buffer page. * @buffer: The ring_buffer to get the system sub page order from * * By default, one ring buffer sub page equals to one system page. This parameter * is configurable, per ring buffer. The size of the ring buffer sub page can be * extended, but must be an order of system page size. * * Returns the order of buffer sub page size, in system pages: * 0 means the sub buffer size is 1 system page and so forth. * In case of an error < 0 is returned. */ int ring_buffer_subbuf_order_get(struct trace_buffer *buffer) { if (!buffer) return -EINVAL; return buffer->subbuf_order; } EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get); /** * ring_buffer_subbuf_order_set - set the size of ring buffer sub page. * @buffer: The ring_buffer to set the new page size. * @order: Order of the system pages in one sub buffer page * * By default, one ring buffer pages equals to one system page. This API can be * used to set new size of the ring buffer page. The size must be order of * system page size, that's why the input parameter @order is the order of * system pages that are allocated for one ring buffer page: * 0 - 1 system page * 1 - 2 system pages * 3 - 4 system pages * ... * * Returns 0 on success or < 0 in case of an error. */ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage, *tmp; int old_order, old_size; int nr_pages; int psize; int err; int cpu; if (!buffer || order < 0) return -EINVAL; if (buffer->subbuf_order == order) return 0; psize = (1 << order) * PAGE_SIZE; if (psize <= BUF_PAGE_HDR_SIZE) return -EINVAL; /* Size of a subbuf cannot be greater than the write counter */ if (psize > RB_WRITE_MASK + 1) return -EINVAL; old_order = buffer->subbuf_order; old_size = buffer->subbuf_size; /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&buffer->record_disabled); /* Make sure all commits have finished */ synchronize_rcu(); buffer->subbuf_order = order; buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE; /* Make sure all new buffers are allocated, before deleting the old ones */ for_each_buffer_cpu(buffer, cpu) { if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; cpu_buffer = buffer->buffers[cpu]; if (cpu_buffer->mapped) { err = -EBUSY; goto error; } /* Update the number of pages to match the new size */ nr_pages = old_size * buffer->buffers[cpu]->nr_pages; nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); /* we need a minimum of two pages */ if (nr_pages < 2) nr_pages = 2; cpu_buffer->nr_pages_to_update = nr_pages; /* Include the reader page */ nr_pages++; /* Allocate the new size buffer */ INIT_LIST_HEAD(&cpu_buffer->new_pages); if (__rb_allocate_pages(cpu_buffer, nr_pages, &cpu_buffer->new_pages)) { /* not enough memory for new pages */ err = -ENOMEM; goto error; } } for_each_buffer_cpu(buffer, cpu) { struct buffer_data_page *old_free_data_page; struct list_head old_pages; unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; cpu_buffer = buffer->buffers[cpu]; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* Clear the head bit to make the link list normal to read */ rb_head_page_deactivate(cpu_buffer); /* * Collect buffers from the cpu_buffer pages list and the * reader_page on old_pages, so they can be freed later when not * under a spinlock. The pages list is a linked list with no * head, adding old_pages turns it into a regular list with * old_pages being the head. */ list_add(&old_pages, cpu_buffer->pages); list_add(&cpu_buffer->reader_page->list, &old_pages); /* One page was allocated for the reader page */ cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next, struct buffer_page, list); list_del_init(&cpu_buffer->reader_page->list); /* Install the new pages, remove the head from the list */ cpu_buffer->pages = cpu_buffer->new_pages.next; list_del_init(&cpu_buffer->new_pages); cpu_buffer->cnt++; cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update; cpu_buffer->nr_pages_to_update = 0; old_free_data_page = cpu_buffer->free_page; cpu_buffer->free_page = NULL; rb_head_page_activate(cpu_buffer); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); /* Free old sub buffers */ list_for_each_entry_safe(bpage, tmp, &old_pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } free_pages((unsigned long)old_free_data_page, old_order); rb_check_pages(cpu_buffer); } atomic_dec(&buffer->record_disabled); mutex_unlock(&buffer->mutex); return 0; error: buffer->subbuf_order = old_order; buffer->subbuf_size = old_size; atomic_dec(&buffer->record_disabled); mutex_unlock(&buffer->mutex); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (!cpu_buffer->nr_pages_to_update) continue; list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } } return err; } EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer) { struct page *page; if (cpu_buffer->meta_page) return 0; page = alloc_page(GFP_USER | __GFP_ZERO); if (!page) return -ENOMEM; cpu_buffer->meta_page = page_to_virt(page); return 0; } static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long addr = (unsigned long)cpu_buffer->meta_page; free_page(addr); cpu_buffer->meta_page = NULL; } static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, unsigned long *subbuf_ids) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; struct buffer_page *first_subbuf, *subbuf; int id = 0; subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; cpu_buffer->reader_page->id = id++; first_subbuf = subbuf = rb_set_head_page(cpu_buffer); do { if (WARN_ON(id >= nr_subbufs)) break; subbuf_ids[id] = (unsigned long)subbuf->page; subbuf->id = id; rb_inc_page(&subbuf); id++; } while (subbuf != first_subbuf); /* install subbuf ID to kern VA translation */ cpu_buffer->subbuf_ids = subbuf_ids; meta->meta_struct_len = sizeof(*meta); meta->nr_subbufs = nr_subbufs; meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; meta->meta_page_size = meta->subbuf_size; rb_update_meta_page(cpu_buffer); } static struct ring_buffer_per_cpu * rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return ERR_PTR(-EINVAL); cpu_buffer = buffer->buffers[cpu]; mutex_lock(&cpu_buffer->mapping_lock); if (!cpu_buffer->user_mapped) { mutex_unlock(&cpu_buffer->mapping_lock); return ERR_PTR(-ENODEV); } return cpu_buffer; } static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer) { mutex_unlock(&cpu_buffer->mapping_lock); } /* * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need * to be set-up or torn-down. */ static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer, bool inc) { unsigned long flags; lockdep_assert_held(&cpu_buffer->mapping_lock); /* mapped is always greater or equal to user_mapped */ if (WARN_ON(cpu_buffer->mapped < cpu_buffer->user_mapped)) return -EINVAL; if (inc && cpu_buffer->mapped == UINT_MAX) return -EBUSY; if (WARN_ON(!inc && cpu_buffer->user_mapped == 0)) return -EINVAL; mutex_lock(&cpu_buffer->buffer->mutex); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (inc) { cpu_buffer->user_mapped++; cpu_buffer->mapped++; } else { cpu_buffer->user_mapped--; cpu_buffer->mapped--; } raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); mutex_unlock(&cpu_buffer->buffer->mutex); return 0; } /* * +--------------+ pgoff == 0 * | meta page | * +--------------+ pgoff == 1 * | subbuffer 0 | * | | * +--------------+ pgoff == (1 + (1 << subbuf_order)) * | subbuffer 1 | * | | * ... */ #ifdef CONFIG_MMU static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, struct vm_area_struct *vma) { unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff; unsigned int subbuf_pages, subbuf_order; struct page **pages; int p = 0, s = 0; int err; /* Refuse MP_PRIVATE or writable mappings */ if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC || !(vma->vm_flags & VM_MAYSHARE)) return -EPERM; subbuf_order = cpu_buffer->buffer->subbuf_order; subbuf_pages = 1 << subbuf_order; if (subbuf_order && pgoff % subbuf_pages) return -EINVAL; /* * Make sure the mapping cannot become writable later. Also tell the VM * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). */ vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP, VM_MAYWRITE); lockdep_assert_held(&cpu_buffer->mapping_lock); nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */ nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; /* + meta-page */ nr_vma_pages = vma_pages(vma); if (!nr_vma_pages || nr_vma_pages > nr_pages) return -EINVAL; nr_pages = nr_vma_pages; pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); if (!pages) return -ENOMEM; if (!pgoff) { unsigned long meta_page_padding; pages[p++] = virt_to_page(cpu_buffer->meta_page); /* * Pad with the zero-page to align the meta-page with the * sub-buffers. */ meta_page_padding = subbuf_pages - 1; while (meta_page_padding-- && p < nr_pages) { unsigned long __maybe_unused zero_addr = vma->vm_start + (PAGE_SIZE * p); pages[p++] = ZERO_PAGE(zero_addr); } } else { /* Skip the meta-page */ pgoff -= subbuf_pages; s += pgoff / subbuf_pages; } while (p < nr_pages) { struct page *page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); int off = 0; if (WARN_ON_ONCE(s >= nr_subbufs)) { err = -EINVAL; goto out; } for (; off < (1 << (subbuf_order)); off++, page++) { if (p >= nr_pages) break; pages[p++] = page; } s++; } err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); out: kfree(pages); return err; } #else static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, struct vm_area_struct *vma) { return -EOPNOTSUPP; } #endif int ring_buffer_map(struct trace_buffer *buffer, int cpu, struct vm_area_struct *vma) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags, *subbuf_ids; int err = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; mutex_lock(&cpu_buffer->mapping_lock); if (cpu_buffer->user_mapped) { err = __rb_map_vma(cpu_buffer, vma); if (!err) err = __rb_inc_dec_mapped(cpu_buffer, true); mutex_unlock(&cpu_buffer->mapping_lock); return err; } /* prevent another thread from changing buffer/sub-buffer sizes */ mutex_lock(&buffer->mutex); err = rb_alloc_meta_page(cpu_buffer); if (err) goto unlock; /* subbuf_ids include the reader while nr_pages does not */ subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); if (!subbuf_ids) { rb_free_meta_page(cpu_buffer); err = -ENOMEM; goto unlock; } atomic_inc(&cpu_buffer->resize_disabled); /* * Lock all readers to block any subbuf swap until the subbuf IDs are * assigned. */ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_setup_ids_meta_page(cpu_buffer, subbuf_ids); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); err = __rb_map_vma(cpu_buffer, vma); if (!err) { raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* This is the first time it is mapped by user */ cpu_buffer->mapped++; cpu_buffer->user_mapped = 1; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } else { kfree(cpu_buffer->subbuf_ids); cpu_buffer->subbuf_ids = NULL; rb_free_meta_page(cpu_buffer); } unlock: mutex_unlock(&buffer->mutex); mutex_unlock(&cpu_buffer->mapping_lock); return err; } int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; int err = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; mutex_lock(&cpu_buffer->mapping_lock); if (!cpu_buffer->user_mapped) { err = -ENODEV; goto out; } else if (cpu_buffer->user_mapped > 1) { __rb_inc_dec_mapped(cpu_buffer, false); goto out; } mutex_lock(&buffer->mutex); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* This is the last user space mapping */ if (!WARN_ON_ONCE(cpu_buffer->mapped < cpu_buffer->user_mapped)) cpu_buffer->mapped--; cpu_buffer->user_mapped = 0; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); kfree(cpu_buffer->subbuf_ids); cpu_buffer->subbuf_ids = NULL; rb_free_meta_page(cpu_buffer); atomic_dec(&cpu_buffer->resize_disabled); mutex_unlock(&buffer->mutex); out: mutex_unlock(&cpu_buffer->mapping_lock); return err; } int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *reader; unsigned long missed_events; unsigned long reader_size; unsigned long flags; cpu_buffer = rb_get_mapped_buffer(buffer, cpu); if (IS_ERR(cpu_buffer)) return (int)PTR_ERR(cpu_buffer); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); consume: if (rb_per_cpu_empty(cpu_buffer)) goto out; reader_size = rb_page_size(cpu_buffer->reader_page); /* * There are data to be read on the current reader page, we can * return to the caller. But before that, we assume the latter will read * everything. Let's update the kernel reader accordingly. */ if (cpu_buffer->reader_page->read < reader_size) { while (cpu_buffer->reader_page->read < reader_size) rb_advance_reader(cpu_buffer); goto out; } reader = rb_get_reader_page(cpu_buffer); if (WARN_ON(!reader)) goto out; /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; if (cpu_buffer->reader_page != cpu_buffer->commit_page) { if (missed_events) { struct buffer_data_page *bpage = reader->page; unsigned int commit; /* * Use the real_end for the data size, * This gives us a chance to store the lost events * on the page. */ if (reader->real_end) local_set(&bpage->commit, reader->real_end); /* * If there is room at the end of the page to save the * missed events, then record it there. */ commit = rb_page_size(reader); if (buffer->subbuf_size - commit >= sizeof(missed_events)) { memcpy(&bpage->data[commit], &missed_events, sizeof(missed_events)); local_add(RB_MISSED_STORED, &bpage->commit); } local_add(RB_MISSED_EVENTS, &bpage->commit); } } else { /* * There really shouldn't be any missed events if the commit * is on the reader page. */ WARN_ON_ONCE(missed_events); } cpu_buffer->lost_events = 0; goto consume; out: /* Some archs do not have data cache coherency between kernel and user-space */ flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page)); rb_update_meta_page(cpu_buffer); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); rb_put_mapped_buffer(cpu_buffer); return 0; } /* * We only allocate new buffers, never free them if the CPU goes down. * If we were to free the buffer, then the user would lose any trace that was in * the buffer. */ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node) { struct trace_buffer *buffer; long nr_pages_same; int cpu_i; unsigned long nr_pages; buffer = container_of(node, struct trace_buffer, node); if (cpumask_test_cpu(cpu, buffer->cpumask)) return 0; nr_pages = 0; nr_pages_same = 1; /* check if all cpu sizes are same */ for_each_buffer_cpu(buffer, cpu_i) { /* fill in the size from first enabled cpu */ if (nr_pages == 0) nr_pages = buffer->buffers[cpu_i]->nr_pages; if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { nr_pages_same = 0; break; } } /* allocate minimum pages, user can later expand it */ if (!nr_pages_same) nr_pages = 2; buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) { WARN(1, "failed to allocate ring buffer on CPU %u\n", cpu); return -ENOMEM; } smp_wmb(); cpumask_set_cpu(cpu, buffer->cpumask); return 0; } #ifdef CONFIG_RING_BUFFER_STARTUP_TEST /* * This is a basic integrity check of the ring buffer. * Late in the boot cycle this test will run when configured in. * It will kick off a thread per CPU that will go into a loop * writing to the per cpu ring buffer various sizes of data. * Some of the data will be large items, some small. * * Another thread is created that goes into a spin, sending out * IPIs to the other CPUs to also write into the ring buffer. * this is to test the nesting ability of the buffer. * * Basic stats are recorded and reported. If something in the * ring buffer should happen that's not expected, a big warning * is displayed and all ring buffers are disabled. */ static struct task_struct *rb_threads[NR_CPUS] __initdata; struct rb_test_data { struct trace_buffer *buffer; unsigned long events; unsigned long bytes_written; unsigned long bytes_alloc; unsigned long bytes_dropped; unsigned long events_nested; unsigned long bytes_written_nested; unsigned long bytes_alloc_nested; unsigned long bytes_dropped_nested; int min_size_nested; int max_size_nested; int max_size; int min_size; int cpu; int cnt; }; static struct rb_test_data rb_data[NR_CPUS] __initdata; /* 1 meg per cpu */ #define RB_TEST_BUFFER_SIZE 1048576 static char rb_string[] __initdata = "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\" "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890" "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv"; static bool rb_test_started __initdata; struct rb_item { int size; char str[]; }; static __init int rb_write_something(struct rb_test_data *data, bool nested) { struct ring_buffer_event *event; struct rb_item *item; bool started; int event_len; int size; int len; int cnt; /* Have nested writes different that what is written */ cnt = data->cnt + (nested ? 27 : 0); /* Multiply cnt by ~e, to make some unique increment */ size = (cnt * 68 / 25) % (sizeof(rb_string) - 1); len = size + sizeof(struct rb_item); started = rb_test_started; /* read rb_test_started before checking buffer enabled */ smp_rmb(); event = ring_buffer_lock_reserve(data->buffer, len); if (!event) { /* Ignore dropped events before test starts. */ if (started) { if (nested) data->bytes_dropped += len; else data->bytes_dropped_nested += len; } return len; } event_len = ring_buffer_event_length(event); if (RB_WARN_ON(data->buffer, event_len < len)) goto out; item = ring_buffer_event_data(event); item->size = size; memcpy(item->str, rb_string, size); if (nested) { data->bytes_alloc_nested += event_len; data->bytes_written_nested += len; data->events_nested++; if (!data->min_size_nested || len < data->min_size_nested) data->min_size_nested = len; if (len > data->max_size_nested) data->max_size_nested = len; } else { data->bytes_alloc += event_len; data->bytes_written += len; data->events++; if (!data->min_size || len < data->min_size) data->max_size = len; if (len > data->max_size) data->max_size = len; } out: ring_buffer_unlock_commit(data->buffer); return 0; } static __init int rb_test(void *arg) { struct rb_test_data *data = arg; while (!kthread_should_stop()) { rb_write_something(data, false); data->cnt++; set_current_state(TASK_INTERRUPTIBLE); /* Now sleep between a min of 100-300us and a max of 1ms */ usleep_range(((data->cnt % 3) + 1) * 100, 1000); } return 0; } static __init void rb_ipi(void *ignore) { struct rb_test_data *data; int cpu = smp_processor_id(); data = &rb_data[cpu]; rb_write_something(data, true); } static __init int rb_hammer_test(void *arg) { while (!kthread_should_stop()) { /* Send an IPI to all cpus to write data! */ smp_call_function(rb_ipi, NULL, 1); /* No sleep, but for non preempt, let others run */ schedule(); } return 0; } static __init int test_ringbuffer(void) { struct task_struct *rb_hammer; struct trace_buffer *buffer; int cpu; int ret = 0; if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Lockdown is enabled, skipping ring buffer tests\n"); return 0; } pr_info("Running ring buffer tests...\n"); buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); if (WARN_ON(!buffer)) return 0; /* Disable buffer so that threads can't write to it yet */ ring_buffer_record_off(buffer); for_each_online_cpu(cpu) { rb_data[cpu].buffer = buffer; rb_data[cpu].cpu = cpu; rb_data[cpu].cnt = cpu; rb_threads[cpu] = kthread_run_on_cpu(rb_test, &rb_data[cpu], cpu, "rbtester/%u"); if (WARN_ON(IS_ERR(rb_threads[cpu]))) { pr_cont("FAILED\n"); ret = PTR_ERR(rb_threads[cpu]); goto out_free; } } /* Now create the rb hammer! */ rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); if (WARN_ON(IS_ERR(rb_hammer))) { pr_cont("FAILED\n"); ret = PTR_ERR(rb_hammer); goto out_free; } ring_buffer_record_on(buffer); /* * Show buffer is enabled before setting rb_test_started. * Yes there's a small race window where events could be * dropped and the thread wont catch it. But when a ring * buffer gets enabled, there will always be some kind of * delay before other CPUs see it. Thus, we don't care about * those dropped events. We care about events dropped after * the threads see that the buffer is active. */ smp_wmb(); rb_test_started = true; set_current_state(TASK_INTERRUPTIBLE); /* Just run for 10 seconds */; schedule_timeout(10 * HZ); kthread_stop(rb_hammer); out_free: for_each_online_cpu(cpu) { if (!rb_threads[cpu]) break; kthread_stop(rb_threads[cpu]); } if (ret) { ring_buffer_free(buffer); return ret; } /* Report! */ pr_info("finished\n"); for_each_online_cpu(cpu) { struct ring_buffer_event *event; struct rb_test_data *data = &rb_data[cpu]; struct rb_item *item; unsigned long total_events; unsigned long total_dropped; unsigned long total_written; unsigned long total_alloc; unsigned long total_read = 0; unsigned long total_size = 0; unsigned long total_len = 0; unsigned long total_lost = 0; unsigned long lost; int big_event_size; int small_event_size; ret = -1; total_events = data->events + data->events_nested; total_written = data->bytes_written + data->bytes_written_nested; total_alloc = data->bytes_alloc + data->bytes_alloc_nested; total_dropped = data->bytes_dropped + data->bytes_dropped_nested; big_event_size = data->max_size + data->max_size_nested; small_event_size = data->min_size + data->min_size_nested; pr_info("CPU %d:\n", cpu); pr_info(" events: %ld\n", total_events); pr_info(" dropped bytes: %ld\n", total_dropped); pr_info(" alloced bytes: %ld\n", total_alloc); pr_info(" written bytes: %ld\n", total_written); pr_info(" biggest event: %d\n", big_event_size); pr_info(" smallest event: %d\n", small_event_size); if (RB_WARN_ON(buffer, total_dropped)) break; ret = 0; while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) { total_lost += lost; item = ring_buffer_event_data(event); total_len += ring_buffer_event_length(event); total_size += item->size + sizeof(struct rb_item); if (memcmp(&item->str[0], rb_string, item->size) != 0) { pr_info("FAILED!\n"); pr_info("buffer had: %.*s\n", item->size, item->str); pr_info("expected: %.*s\n", item->size, rb_string); RB_WARN_ON(buffer, 1); ret = -1; break; } total_read++; } if (ret) break; ret = -1; pr_info(" read events: %ld\n", total_read); pr_info(" lost events: %ld\n", total_lost); pr_info(" total events: %ld\n", total_lost + total_read); pr_info(" recorded len bytes: %ld\n", total_len); pr_info(" recorded size bytes: %ld\n", total_size); if (total_lost) { pr_info(" With dropped events, record len and size may not match\n" " alloced and written from above\n"); } else { if (RB_WARN_ON(buffer, total_len != total_alloc || total_size != total_written)) break; } if (RB_WARN_ON(buffer, total_lost + total_read != total_events)) break; ret = 0; } if (!ret) pr_info("Ring buffer PASSED!\n"); ring_buffer_free(buffer); return 0; } late_initcall(test_ringbuffer); #endif /* CONFIG_RING_BUFFER_STARTUP_TEST */
16 16 23 2 1 2 1 17 16 16 16 7 9 16 16 16 16 16 16 16 7 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <linux/rculist.h> #include <linux/list.h> #include <linux/hash.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/bpf.h> #include <linux/btf_ids.h> #include <linux/bpf_local_storage.h> #include <net/sock.h> #include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> #include <linux/rcupdate.h> #include <linux/rcupdate_trace.h> #include <linux/rcupdate_wait.h> #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE) static struct bpf_local_storage_map_bucket * select_bucket(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) { return &smap->buckets[hash_ptr(selem, smap->bucket_log)]; } static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) { struct bpf_map *map = &smap->map; if (!map->ops->map_local_storage_charge) return 0; return map->ops->map_local_storage_charge(smap, owner, size); } static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner, u32 size) { struct bpf_map *map = &smap->map; if (map->ops->map_local_storage_uncharge) map->ops->map_local_storage_uncharge(smap, owner, size); } static struct bpf_local_storage __rcu ** owner_storage(struct bpf_local_storage_map *smap, void *owner) { struct bpf_map *map = &smap->map; return map->ops->map_owner_storage_ptr(owner); } static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed_lockless(&selem->snode); } static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed(&selem->snode); } static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed_lockless(&selem->map_node); } static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed(&selem->map_node); } struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags) { struct bpf_local_storage_elem *selem; if (charge_mem && mem_charge(smap, owner, smap->elem_size)) return NULL; if (smap->bpf_ma) { migrate_disable(); selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags); migrate_enable(); if (selem) /* Keep the original bpf_map_kzalloc behavior * before started using the bpf_mem_cache_alloc. * * No need to use zero_map_value. The bpf_selem_free() * only does bpf_mem_cache_free when there is * no other bpf prog is using the selem. */ memset(SDATA(selem)->data, 0, smap->map.value_size); } else { selem = bpf_map_kzalloc(&smap->map, smap->elem_size, gfp_flags | __GFP_NOWARN); } if (selem) { if (value) { /* No need to call check_and_init_map_value as memory is zero init */ copy_map_value(&smap->map, SDATA(selem)->data, value); if (swap_uptrs) bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value); } return selem; } if (charge_mem) mem_uncharge(smap, owner, smap->elem_size); return NULL; } /* rcu tasks trace callback for bpf_ma == false */ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; /* If RCU Tasks Trace grace period implies RCU grace period, do * kfree(), else do kfree_rcu(). */ local_storage = container_of(rcu, struct bpf_local_storage, rcu); if (rcu_trace_implies_rcu_gp()) kfree(local_storage); else kfree_rcu(local_storage, rcu); } static void bpf_local_storage_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; local_storage = container_of(rcu, struct bpf_local_storage, rcu); bpf_mem_cache_raw_free(local_storage); } static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { if (rcu_trace_implies_rcu_gp()) bpf_local_storage_free_rcu(rcu); else call_rcu(rcu, bpf_local_storage_free_rcu); } /* Handle bpf_ma == false */ static void __bpf_local_storage_free(struct bpf_local_storage *local_storage, bool vanilla_rcu) { if (vanilla_rcu) kfree_rcu(local_storage, rcu); else call_rcu_tasks_trace(&local_storage->rcu, __bpf_local_storage_free_trace_rcu); } static void bpf_local_storage_free(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool bpf_ma, bool reuse_now) { if (!local_storage) return; if (!bpf_ma) { __bpf_local_storage_free(local_storage, reuse_now); return; } if (!reuse_now) { call_rcu_tasks_trace(&local_storage->rcu, bpf_local_storage_free_trace_rcu); return; } if (smap) { migrate_disable(); bpf_mem_cache_free(&smap->storage_ma, local_storage); migrate_enable(); } else { /* smap could be NULL if the selem that triggered * this 'local_storage' creation had been long gone. * In this case, directly do call_rcu(). */ call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); } } /* rcu tasks trace callback for bpf_ma == false */ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); if (rcu_trace_implies_rcu_gp()) kfree(selem); else kfree_rcu(selem, rcu); } /* Handle bpf_ma == false */ static void __bpf_selem_free(struct bpf_local_storage_elem *selem, bool vanilla_rcu) { if (vanilla_rcu) kfree_rcu(selem, rcu); else call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu); } static void bpf_selem_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; struct bpf_local_storage_map *smap; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); /* The bpf_local_storage_map_free will wait for rcu_barrier */ smap = rcu_dereference_check(SDATA(selem)->smap, 1); bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); bpf_mem_cache_raw_free(selem); } static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) { if (rcu_trace_implies_rcu_gp()) bpf_selem_free_rcu(rcu); else call_rcu(rcu, bpf_selem_free_rcu); } void bpf_selem_free(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *smap, bool reuse_now) { if (!smap->bpf_ma) { /* Only task storage has uptrs and task storage * has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true * for task storage, so this bpf_obj_free_fields() won't unpin * any uptr. */ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); return; } if (reuse_now) { /* reuse_now == true only happens when the storage owner * (e.g. task_struct) is being destructed or the map itself * is being destructed (ie map_free). In both cases, * no bpf prog can have a hold on the selem. It is * safe to unpin the uptrs and free the selem now. */ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); /* Instead of using the vanilla call_rcu(), * bpf_mem_cache_free will be able to reuse selem * immediately. */ migrate_disable(); bpf_mem_cache_free(&smap->selem_ma, selem); migrate_enable(); return; } call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); } static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) { struct bpf_local_storage_elem *selem; struct bpf_local_storage_map *smap; struct hlist_node *n; /* The "_safe" iteration is needed. * The loop is not removing the selem from the list * but bpf_selem_free will use the selem->rcu_head * which is union-ized with the selem->free_node. */ hlist_for_each_entry_safe(selem, n, list, free_node) { smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); bpf_selem_free(selem, smap, reuse_now); } } /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. */ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, bool uncharge_mem, struct hlist_head *free_selem_list) { struct bpf_local_storage_map *smap; bool free_local_storage; void *owner; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); owner = local_storage->owner; /* All uncharging on the owner must be done first. * The owner may be freed once the last selem is unlinked * from local_storage. */ if (uncharge_mem) mem_uncharge(smap, owner, smap->elem_size); free_local_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); if (free_local_storage) { mem_uncharge(smap, owner, sizeof(struct bpf_local_storage)); local_storage->owner = NULL; /* After this RCU_INIT, owner may be freed and cannot be used */ RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); /* local_storage is not freed now. local_storage->lock is * still held and raw_spin_unlock_bh(&local_storage->lock) * will be done by the caller. * * Although the unlock will be done under * rcu_read_lock(), it is more intuitive to * read if the freeing of the storage is done * after the raw_spin_unlock_bh(&local_storage->lock). * * Hence, a "bool free_local_storage" is returned * to the caller which then calls then frees the storage after * all the RCU grace periods have expired. */ } hlist_del_init_rcu(&selem->snode); if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); hlist_add_head(&selem->free_node, free_selem_list); if (rcu_access_pointer(local_storage->smap) == smap) RCU_INIT_POINTER(local_storage->smap, NULL); return free_local_storage; } static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *storage_smap, struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *selem_smap; /* local_storage->smap may be NULL. If it is, get the bpf_ma * from any selem in the local_storage->list. The bpf_ma of all * local_storage and selem should have the same value * for the same map type. * * If the local_storage->list is already empty, the caller will not * care about the bpf_ma value also because the caller is not * responsible to free the local_storage. */ if (storage_smap) return storage_smap->bpf_ma; if (!selem) { struct hlist_node *n; n = rcu_dereference_check(hlist_first_rcu(&local_storage->list), bpf_rcu_lock_held()); if (!n) return false; selem = hlist_entry(n, struct bpf_local_storage_elem, snode); } selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); return selem_smap->bpf_ma; } static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, bool reuse_now) { struct bpf_local_storage_map *storage_smap; struct bpf_local_storage *local_storage; bool bpf_ma, free_local_storage = false; HLIST_HEAD(selem_free_list); unsigned long flags; if (unlikely(!selem_linked_to_storage_lockless(selem))) /* selem has already been unlinked from sk */ return; local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held()); bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem); raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, true, &selem_free_list); raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&selem_free_list, reuse_now); if (free_local_storage) bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now); } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { RCU_INIT_POINTER(selem->local_storage, local_storage); hlist_add_head_rcu(&selem->snode, &local_storage->list); } static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; unsigned long flags; if (unlikely(!selem_linked_to_map_lockless(selem))) /* selem has already be unlinked from smap */ return; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); b = select_bucket(smap, selem); raw_spin_lock_irqsave(&b->lock, flags); if (likely(selem_linked_to_map(selem))) hlist_del_init_rcu(&selem->map_node); raw_spin_unlock_irqrestore(&b->lock, flags); } void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); unsigned long flags; raw_spin_lock_irqsave(&b->lock, flags); RCU_INIT_POINTER(SDATA(selem)->smap, smap); hlist_add_head_rcu(&selem->map_node, &b->list); raw_spin_unlock_irqrestore(&b->lock, flags); } void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from * the local_storage. */ bpf_selem_unlink_map(selem); bpf_selem_unlink_storage(selem, reuse_now); } void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) { unsigned long flags; /* spinlock is needed to avoid racing with the * parallel delete. Otherwise, publishing an already * deleted sdata to the cache will become a use-after-free * problem in the next bpf_local_storage_lookup(). */ raw_spin_lock_irqsave(&local_storage->lock, flags); if (selem_linked_to_storage(selem)) rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem)); raw_spin_unlock_irqrestore(&local_storage->lock, flags); } static int check_flags(const struct bpf_local_storage_data *old_sdata, u64 map_flags) { if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) /* elem already exists */ return -EEXIST; if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) /* elem doesn't exist, cannot update it */ return -ENOENT; return 0; } int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *first_selem, gfp_t gfp_flags) { struct bpf_local_storage *prev_storage, *storage; struct bpf_local_storage **owner_storage_ptr; int err; err = mem_charge(smap, owner, sizeof(*storage)); if (err) return err; if (smap->bpf_ma) { migrate_disable(); storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags); migrate_enable(); } else { storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), gfp_flags | __GFP_NOWARN); } if (!storage) { err = -ENOMEM; goto uncharge; } RCU_INIT_POINTER(storage->smap, smap); INIT_HLIST_HEAD(&storage->list); raw_spin_lock_init(&storage->lock); storage->owner = owner; bpf_selem_link_storage_nolock(storage, first_selem); bpf_selem_link_map(smap, first_selem); owner_storage_ptr = (struct bpf_local_storage **)owner_storage(smap, owner); /* Publish storage to the owner. * Instead of using any lock of the kernel object (i.e. owner), * cmpxchg will work with any kernel object regardless what * the running context is, bh, irq...etc. * * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage) * is protected by the storage->lock. Hence, when freeing * the owner->storage, the storage->lock must be held before * setting owner->storage ptr to NULL. */ prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); if (unlikely(prev_storage)) { bpf_selem_unlink_map(first_selem); err = -EAGAIN; goto uncharge; /* Note that even first_selem was linked to smap's * bucket->list, first_selem can be freed immediately * (instead of kfree_rcu) because * bpf_local_storage_map_free() does a * synchronize_rcu_mult (waiting for both sleepable and * normal programs) before walking the bucket->list. * Hence, no one is accessing selem from the * bucket->list under rcu_read_lock(). */ } return 0; uncharge: bpf_local_storage_free(storage, smap, smap->bpf_ma, true); mem_uncharge(smap, owner, sizeof(*storage)); return err; } /* sk cannot be going away because it is linking new elem * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0). * Otherwise, it will become a leak (and other memory issues * during map destruction). */ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *alloc_selem, *selem = NULL; struct bpf_local_storage *local_storage; HLIST_HEAD(old_selem_free_list); unsigned long flags; int err; /* BPF_EXIST and BPF_NOEXIST cannot be both set */ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || /* BPF_F_LOCK can only be used in a value with spin_lock */ unlikely((map_flags & BPF_F_LOCK) && !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))) return ERR_PTR(-EINVAL); if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST) return ERR_PTR(-EINVAL); local_storage = rcu_dereference_check(*owner_storage(smap, owner), bpf_rcu_lock_held()); if (!local_storage || hlist_empty(&local_storage->list)) { /* Very first elem for the owner */ err = check_flags(NULL, map_flags); if (err) return ERR_PTR(err); selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); if (!selem) return ERR_PTR(-ENOMEM); err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags); if (err) { bpf_selem_free(selem, smap, true); mem_uncharge(smap, owner, smap->elem_size); return ERR_PTR(err); } return SDATA(selem); } if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) { /* Hoping to find an old_sdata to do inline update * such that it can avoid taking the local_storage->lock * and changing the lists. */ old_sdata = bpf_local_storage_lookup(local_storage, smap, false); err = check_flags(old_sdata, map_flags); if (err) return ERR_PTR(err); if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) { copy_map_value_locked(&smap->map, old_sdata->data, value, false); return old_sdata; } } /* A lookup has just been done before and concluded a new selem is * needed. The chance of an unnecessary alloc is unlikely. */ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); if (!alloc_selem) return ERR_PTR(-ENOMEM); raw_spin_lock_irqsave(&local_storage->lock, flags); /* Recheck local_storage->list under local_storage->lock */ if (unlikely(hlist_empty(&local_storage->list))) { /* A parallel del is happening and local_storage is going * away. It has just been checked before, so very * unlikely. Return instead of retry to keep things * simple. */ err = -EAGAIN; goto unlock; } old_sdata = bpf_local_storage_lookup(local_storage, smap, false); err = check_flags(old_sdata, map_flags); if (err) goto unlock; if (old_sdata && (map_flags & BPF_F_LOCK)) { copy_map_value_locked(&smap->map, old_sdata->data, value, false); selem = SELEM(old_sdata); goto unlock; } alloc_selem = NULL; /* First, link the new selem to the map */ bpf_selem_link_map(smap, selem); /* Second, link (and publish) the new selem to local_storage */ bpf_selem_link_storage_nolock(local_storage, selem); /* Third, remove old selem, SELEM(old_sdata) */ if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), true, &old_selem_free_list); } unlock: raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&old_selem_free_list, false); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); bpf_selem_free(alloc_selem, smap, true); } return err ? ERR_PTR(err) : SDATA(selem); } static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) { u64 min_usage = U64_MAX; u16 i, res = 0; spin_lock(&cache->idx_lock); for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) { if (cache->idx_usage_counts[i] < min_usage) { min_usage = cache->idx_usage_counts[i]; res = i; /* Found a free cache_idx */ if (!min_usage) break; } } cache->idx_usage_counts[res]++; spin_unlock(&cache->idx_lock); return res; } static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, u16 idx) { spin_lock(&cache->idx_lock); cache->idx_usage_counts[idx]--; spin_unlock(&cache->idx_lock); } int bpf_local_storage_map_alloc_check(union bpf_attr *attr) { if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK || !(attr->map_flags & BPF_F_NO_PREALLOC) || attr->max_entries || attr->key_size != sizeof(int) || !attr->value_size || /* Enforce BTF for userspace sk dumping */ !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE) return -E2BIG; return 0; } int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { u32 int_data; if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; int_data = *(u32 *)(key_type + 1); if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) return -EINVAL; return 0; } void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { struct bpf_local_storage_map *storage_smap; struct bpf_local_storage_elem *selem; bool bpf_ma, free_storage = false; HLIST_HEAD(free_selem_list); struct hlist_node *n; unsigned long flags; storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held()); bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL); /* Neither the bpf_prog nor the bpf_map's syscall * could be modifying the local_storage->list now. * Thus, no elem can be added to or deleted from the * local_storage->list by the bpf_prog or by the bpf_map's syscall. * * It is racing with bpf_local_storage_map_free() alone * when unlinking elem from the local_storage->list and * the map's bucket->list. */ raw_spin_lock_irqsave(&local_storage->lock, flags); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { /* Always unlink from map before unlinking from * local_storage. */ bpf_selem_unlink_map(selem); /* If local_storage list has only one element, the * bpf_selem_unlink_storage_nolock() will return true. * Otherwise, it will return false. The current loop iteration * intends to remove all local storage. So the last iteration * of the loop will set the free_cgroup_storage to true. */ free_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, true, &free_selem_list); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&free_selem_list, true); if (free_storage) bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true); } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) { struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map; u64 usage = sizeof(*smap); /* The dynamically callocated selems are not counted currently. */ usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log); return usage; } /* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory. * A deadlock free allocator is useful for storage that the bpf prog can easily * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf. * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses * memory immediately. To be reuse-immediate safe, the owner destruction * code path needs to go through a rcu grace period before calling * bpf_local_storage_destroy(). * * When bpf_ma == false, the kmalloc and kfree are used. */ struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, struct bpf_local_storage_cache *cache, bool bpf_ma) { struct bpf_local_storage_map *smap; unsigned int i; u32 nbuckets; int err; smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); nbuckets = roundup_pow_of_two(num_possible_cpus()); /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ nbuckets = max_t(u32, 2, nbuckets); smap->bucket_log = ilog2(nbuckets); smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets, sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN); if (!smap->buckets) { err = -ENOMEM; goto free_smap; } for (i = 0; i < nbuckets; i++) { INIT_HLIST_HEAD(&smap->buckets[i].list); raw_spin_lock_init(&smap->buckets[i].lock); } smap->elem_size = offsetof(struct bpf_local_storage_elem, sdata.data[attr->value_size]); /* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non * preemptible context. Thus, enforce all storages to use * bpf_mem_alloc when CONFIG_PREEMPT_RT is enabled. */ smap->bpf_ma = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : bpf_ma; if (smap->bpf_ma) { err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false); if (err) goto free_smap; err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false); if (err) { bpf_mem_alloc_destroy(&smap->selem_ma); goto free_smap; } } smap->cache_idx = bpf_local_storage_cache_idx_get(cache); return &smap->map; free_smap: kvfree(smap->buckets); bpf_map_area_free(smap); return ERR_PTR(err); } void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache, int __percpu *busy_counter) { struct bpf_local_storage_map_bucket *b; struct bpf_local_storage_elem *selem; struct bpf_local_storage_map *smap; unsigned int i; smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(cache, smap->cache_idx); /* Note that this map might be concurrently cloned from * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone * RCU read section to finish before proceeding. New RCU * read sections should be prevented via bpf_map_inc_not_zero. */ synchronize_rcu(); /* bpf prog and the userspace can no longer access this map * now. No new selem (of this map) can be added * to the owner->storage or to the map bucket's list. * * The elem of this map can be cleaned up here * or when the storage is freed e.g. * by bpf_sk_storage_free() during __sk_destruct(). */ for (i = 0; i < (1U << smap->bucket_log); i++) { b = &smap->buckets[i]; rcu_read_lock(); /* No one is adding to b->list now */ while ((selem = hlist_entry_safe( rcu_dereference_raw(hlist_first_rcu(&b->list)), struct bpf_local_storage_elem, map_node))) { if (busy_counter) { migrate_disable(); this_cpu_inc(*busy_counter); } bpf_selem_unlink(selem, true); if (busy_counter) { this_cpu_dec(*busy_counter); migrate_enable(); } cond_resched_rcu(); } rcu_read_unlock(); } /* While freeing the storage we may still need to access the map. * * e.g. when bpf_sk_storage_free() has unlinked selem from the map * which then made the above while((selem = ...)) loop * exit immediately. * * However, while freeing the storage one still needs to access the * smap->elem_size to do the uncharging in * bpf_selem_unlink_storage_nolock(). * * Hence, wait another rcu grace period for the storage to be freed. */ synchronize_rcu(); if (smap->bpf_ma) { rcu_barrier_tasks_trace(); if (!rcu_trace_implies_rcu_gp()) rcu_barrier(); bpf_mem_alloc_destroy(&smap->selem_ma); bpf_mem_alloc_destroy(&smap->storage_ma); } kvfree(smap->buckets); bpf_map_area_free(smap); }
4337 4326 1308 4335 295 294 1 1 275 276 294 293 9 293 294 54 54 118 125 125 4190 4180 1073 1106 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 #ifndef _LINUX_MMAP_LOCK_H #define _LINUX_MMAP_LOCK_H #include <linux/lockdep.h> #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/rwsem.h> #include <linux/tracepoint-defs.h> #include <linux/types.h> #define MMAP_LOCK_INITIALIZER(name) \ .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), DECLARE_TRACEPOINT(mmap_lock_start_locking); DECLARE_TRACEPOINT(mmap_lock_acquire_returned); DECLARE_TRACEPOINT(mmap_lock_released); #ifdef CONFIG_TRACING void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write); void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, bool success); void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write); static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, bool write) { if (tracepoint_enabled(mmap_lock_start_locking)) __mmap_lock_do_trace_start_locking(mm, write); } static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, bool write, bool success) { if (tracepoint_enabled(mmap_lock_acquire_returned)) __mmap_lock_do_trace_acquire_returned(mm, write, success); } static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) { if (tracepoint_enabled(mmap_lock_released)) __mmap_lock_do_trace_released(mm, write); } #else /* !CONFIG_TRACING */ static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, bool write) { } static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, bool write, bool success) { } static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) { } #endif /* CONFIG_TRACING */ static inline void mmap_assert_locked(const struct mm_struct *mm) { rwsem_assert_held(&mm->mmap_lock); } static inline void mmap_assert_write_locked(const struct mm_struct *mm) { rwsem_assert_held_write(&mm->mmap_lock); } #ifdef CONFIG_PER_VMA_LOCK /* * Drop all currently-held per-VMA locks. * This is called from the mmap_lock implementation directly before releasing * a write-locked mmap_lock (or downgrading it to read-locked). * This should normally NOT be called manually from other places. * If you want to call this manually anyway, keep in mind that this will release * *all* VMA write locks, including ones from further up the stack. */ static inline void vma_end_write_all(struct mm_struct *mm) { mmap_assert_write_locked(mm); /* * Nobody can concurrently modify mm->mm_lock_seq due to exclusive * mmap_lock being held. * We need RELEASE semantics here to ensure that preceding stores into * the VMA take effect before we unlock it with this store. * Pairs with ACQUIRE semantics in vma_start_read(). */ smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1); } #else static inline void vma_end_write_all(struct mm_struct *mm) {} #endif static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); } static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); down_write(&mm->mmap_lock); __mmap_lock_trace_acquire_returned(mm, true, true); } static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { __mmap_lock_trace_start_locking(mm, true); down_write_nested(&mm->mmap_lock, subclass); __mmap_lock_trace_acquire_returned(mm, true, true); } static inline int mmap_write_lock_killable(struct mm_struct *mm) { int ret; __mmap_lock_trace_start_locking(mm, true); ret = down_write_killable(&mm->mmap_lock); __mmap_lock_trace_acquire_returned(mm, true, ret == 0); return ret; } static inline void mmap_write_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, true); vma_end_write_all(mm); up_write(&mm->mmap_lock); } static inline void mmap_write_downgrade(struct mm_struct *mm) { __mmap_lock_trace_acquire_returned(mm, false, true); vma_end_write_all(mm); downgrade_write(&mm->mmap_lock); } static inline void mmap_read_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, false); down_read(&mm->mmap_lock); __mmap_lock_trace_acquire_returned(mm, false, true); } static inline int mmap_read_lock_killable(struct mm_struct *mm) { int ret; __mmap_lock_trace_start_locking(mm, false); ret = down_read_killable(&mm->mmap_lock); __mmap_lock_trace_acquire_returned(mm, false, ret == 0); return ret; } static inline bool mmap_read_trylock(struct mm_struct *mm) { bool ret; __mmap_lock_trace_start_locking(mm, false); ret = down_read_trylock(&mm->mmap_lock) != 0; __mmap_lock_trace_acquire_returned(mm, false, ret); return ret; } static inline void mmap_read_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, false); up_read(&mm->mmap_lock); } static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { __mmap_lock_trace_released(mm, false); up_read_non_owner(&mm->mmap_lock); } static inline int mmap_lock_is_contended(struct mm_struct *mm) { return rwsem_is_contended(&mm->mmap_lock); } #endif /* _LINUX_MMAP_LOCK_H */
428 424 424 424 424 135 134 135 523 11 523 522 523 523 521 521 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/main.c - Where the driver meets power management. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * * The driver model core calls device_pm_add() when a device is registered. * This will initialize the embedded device_pm_info object in the device * and add it to the list of power-controlled devices. sysfs entries for * controlling device power management will also be added. * * A separate list is used for keeping track of power info, because the power * domain dependencies may differ from the ancestral dependencies that the * subsystem list maintains. */ #define pr_fmt(fmt) "PM: " fmt #define dev_fmt pr_fmt #include <linux/device.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/pm.h> #include <linux/pm_runtime.h> #include <linux/pm-trace.h> #include <linux/pm_wakeirq.h> #include <linux/interrupt.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/async.h> #include <linux/suspend.h> #include <trace/events/power.h> #include <linux/cpufreq.h> #include <linux/devfreq.h> #include <linux/timer.h> #include "../base.h" #include "power.h" typedef int (*pm_callback_t)(struct device *); #define list_for_each_entry_rcu_locked(pos, head, member) \ list_for_each_entry_rcu(pos, head, member, \ device_links_read_lock_held()) /* * The entries in the dpm_list list are in a depth first order, simply * because children are guaranteed to be discovered after parents, and * are inserted at the back of the list on discovery. * * Since device_pm_add() may be called with a device lock held, * we must never try to acquire a device lock while holding * dpm_list_mutex. */ LIST_HEAD(dpm_list); static LIST_HEAD(dpm_prepared_list); static LIST_HEAD(dpm_suspended_list); static LIST_HEAD(dpm_late_early_list); static LIST_HEAD(dpm_noirq_list); static DEFINE_MUTEX(dpm_list_mtx); static pm_message_t pm_transition; static int async_error; static const char *pm_verb(int event) { switch (event) { case PM_EVENT_SUSPEND: return "suspend"; case PM_EVENT_RESUME: return "resume"; case PM_EVENT_FREEZE: return "freeze"; case PM_EVENT_QUIESCE: return "quiesce"; case PM_EVENT_HIBERNATE: return "hibernate"; case PM_EVENT_THAW: return "thaw"; case PM_EVENT_RESTORE: return "restore"; case PM_EVENT_RECOVER: return "recover"; default: return "(unknown PM event)"; } } /** * device_pm_sleep_init - Initialize system suspend-related device fields. * @dev: Device object being initialized. */ void device_pm_sleep_init(struct device *dev) { dev->power.is_prepared = false; dev->power.is_suspended = false; dev->power.is_noirq_suspended = false; dev->power.is_late_suspended = false; init_completion(&dev->power.completion); complete_all(&dev->power.completion); dev->power.wakeup = NULL; INIT_LIST_HEAD(&dev->power.entry); } /** * device_pm_lock - Lock the list of active devices used by the PM core. */ void device_pm_lock(void) { mutex_lock(&dpm_list_mtx); } /** * device_pm_unlock - Unlock the list of active devices used by the PM core. */ void device_pm_unlock(void) { mutex_unlock(&dpm_list_mtx); } /** * device_pm_add - Add a device to the PM core's list of active devices. * @dev: Device to add to the list. */ void device_pm_add(struct device *dev) { /* Skip PM setup/initialization. */ if (device_pm_not_required(dev)) return; pr_debug("Adding info for %s:%s\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); device_pm_check_callbacks(dev); mutex_lock(&dpm_list_mtx); if (dev->parent && dev->parent->power.is_prepared) dev_warn(dev, "parent %s should not be sleeping\n", dev_name(dev->parent)); list_add_tail(&dev->power.entry, &dpm_list); dev->power.in_dpm_list = true; mutex_unlock(&dpm_list_mtx); } /** * device_pm_remove - Remove a device from the PM core's list of active devices. * @dev: Device to be removed from the list. */ void device_pm_remove(struct device *dev) { if (device_pm_not_required(dev)) return; pr_debug("Removing info for %s:%s\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); complete_all(&dev->power.completion); mutex_lock(&dpm_list_mtx); list_del_init(&dev->power.entry); dev->power.in_dpm_list = false; mutex_unlock(&dpm_list_mtx); device_wakeup_disable(dev); pm_runtime_remove(dev); device_pm_check_callbacks(dev); } /** * device_pm_move_before - Move device in the PM core's list of active devices. * @deva: Device to move in dpm_list. * @devb: Device @deva should come before. */ void device_pm_move_before(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s before %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert before devb. */ list_move_tail(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_after - Move device in the PM core's list of active devices. * @deva: Device to move in dpm_list. * @devb: Device @deva should come after. */ void device_pm_move_after(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s after %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert after devb. */ list_move(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_last - Move device to end of the PM core's list of devices. * @dev: Device to move in dpm_list. */ void device_pm_move_last(struct device *dev) { pr_debug("Moving %s:%s to end of list\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); list_move_tail(&dev->power.entry, &dpm_list); } static ktime_t initcall_debug_start(struct device *dev, void *cb) { if (!pm_print_times_enabled) return 0; dev_info(dev, "calling %ps @ %i, parent: %s\n", cb, task_pid_nr(current), dev->parent ? dev_name(dev->parent) : "none"); return ktime_get(); } static void initcall_debug_report(struct device *dev, ktime_t calltime, void *cb, int error) { ktime_t rettime; if (!pm_print_times_enabled) return; rettime = ktime_get(); dev_info(dev, "%ps returned %d after %Ld usecs\n", cb, error, (unsigned long long)ktime_us_delta(rettime, calltime)); } /** * dpm_wait - Wait for a PM operation to complete. * @dev: Device to wait for. * @async: If unset, wait only if the device's power.async_suspend flag is set. */ static void dpm_wait(struct device *dev, bool async) { if (!dev) return; if (async || (pm_async_enabled && dev->power.async_suspend)) wait_for_completion(&dev->power.completion); } static int dpm_wait_fn(struct device *dev, void *async_ptr) { dpm_wait(dev, *((bool *)async_ptr)); return 0; } static void dpm_wait_for_children(struct device *dev, bool async) { device_for_each_child(dev, &async, dpm_wait_fn); } static void dpm_wait_for_suppliers(struct device *dev, bool async) { struct device_link *link; int idx; idx = device_links_read_lock(); /* * If the supplier goes away right after we've checked the link to it, * we'll wait for its completion to change the state, but that's fine, * because the only things that will block as a result are the SRCU * callbacks freeing the link objects for the links in the list we're * walking. */ list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_wait(link->supplier, async); device_links_read_unlock(idx); } static bool dpm_wait_for_superior(struct device *dev, bool async) { struct device *parent; /* * If the device is resumed asynchronously and the parent's callback * deletes both the device and the parent itself, the parent object may * be freed while this function is running, so avoid that by reference * counting the parent once more unless the device has been deleted * already (in which case return right away). */ mutex_lock(&dpm_list_mtx); if (!device_pm_initialized(dev)) { mutex_unlock(&dpm_list_mtx); return false; } parent = get_device(dev->parent); mutex_unlock(&dpm_list_mtx); dpm_wait(parent, async); put_device(parent); dpm_wait_for_suppliers(dev, async); /* * If the parent's callback has deleted the device, attempting to resume * it would be invalid, so avoid doing that then. */ return device_pm_initialized(dev); } static void dpm_wait_for_consumers(struct device *dev, bool async) { struct device_link *link; int idx; idx = device_links_read_lock(); /* * The status of a device link can only be changed from "dormant" by a * probe, but that cannot happen during system suspend/resume. In * theory it can change to "dormant" at that time, but then it is * reasonable to wait for the target device anyway (eg. if it goes * away, it's better to wait for it to go away completely and then * continue instead of trying to continue in parallel with its * unregistration). */ list_for_each_entry_rcu_locked(link, &dev->links.consumers, s_node) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_wait(link->consumer, async); device_links_read_unlock(idx); } static void dpm_wait_for_subordinate(struct device *dev, bool async) { dpm_wait_for_children(dev, async); dpm_wait_for_consumers(dev, async); } /** * pm_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. */ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend; case PM_EVENT_RESUME: return ops->resume; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze; case PM_EVENT_HIBERNATE: return ops->poweroff; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw; case PM_EVENT_RESTORE: return ops->restore; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } /** * pm_late_early_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. * * Runtime PM is disabled for @dev while this function is being executed. */ static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend_late; case PM_EVENT_RESUME: return ops->resume_early; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_late; case PM_EVENT_HIBERNATE: return ops->poweroff_late; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw_early; case PM_EVENT_RESTORE: return ops->restore_early; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } /** * pm_noirq_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend_noirq; case PM_EVENT_RESUME: return ops->resume_noirq; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_noirq; case PM_EVENT_HIBERNATE: return ops->poweroff_noirq; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw_noirq; case PM_EVENT_RESTORE: return ops->restore_noirq; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } static void pm_dev_dbg(struct device *dev, pm_message_t state, const char *info) { dev_dbg(dev, "%s%s%s driver flags: %x\n", info, pm_verb(state.event), ((state.event & PM_EVENT_SLEEP) && device_may_wakeup(dev)) ? ", may wakeup" : "", dev->power.driver_flags); } static void pm_dev_err(struct device *dev, pm_message_t state, const char *info, int error) { dev_err(dev, "failed to %s%s: error %d\n", pm_verb(state.event), info, error); } static void dpm_show_time(ktime_t starttime, pm_message_t state, int error, const char *info) { ktime_t calltime; u64 usecs64; int usecs; calltime = ktime_get(); usecs64 = ktime_to_ns(ktime_sub(calltime, starttime)); do_div(usecs64, NSEC_PER_USEC); usecs = usecs64; if (usecs == 0) usecs = 1; pm_pr_dbg("%s%s%s of devices %s after %ld.%03ld msecs\n", info ?: "", info ? " " : "", pm_verb(state.event), error ? "aborted" : "complete", usecs / USEC_PER_MSEC, usecs % USEC_PER_MSEC); } static int dpm_run_callback(pm_callback_t cb, struct device *dev, pm_message_t state, const char *info) { ktime_t calltime; int error; if (!cb) return 0; calltime = initcall_debug_start(dev, cb); pm_dev_dbg(dev, state, info); trace_device_pm_callback_start(dev, info, state.event); error = cb(dev); trace_device_pm_callback_end(dev, error); suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); return error; } #ifdef CONFIG_DPM_WATCHDOG struct dpm_watchdog { struct device *dev; struct task_struct *tsk; struct timer_list timer; }; #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \ struct dpm_watchdog wd /** * dpm_watchdog_handler - Driver suspend / resume watchdog handler. * @t: The timer that PM watchdog depends on. * * Called when a driver has timed out suspending or resuming. * There's not much we can do here to recover so panic() to * capture a crash-dump in pstore. */ static void dpm_watchdog_handler(struct timer_list *t) { struct dpm_watchdog *wd = from_timer(wd, t, timer); dev_emerg(wd->dev, "**** DPM device timeout ****\n"); show_stack(wd->tsk, NULL, KERN_EMERG); panic("%s %s: unrecoverable failure\n", dev_driver_string(wd->dev), dev_name(wd->dev)); } /** * dpm_watchdog_set - Enable pm watchdog for given device. * @wd: Watchdog. Must be allocated on the stack. * @dev: Device to handle. */ static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev) { struct timer_list *timer = &wd->timer; wd->dev = dev; wd->tsk = current; timer_setup_on_stack(timer, dpm_watchdog_handler, 0); /* use same timeout value for both suspend and resume */ timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_TIMEOUT; add_timer(timer); } /** * dpm_watchdog_clear - Disable suspend/resume watchdog. * @wd: Watchdog to disable. */ static void dpm_watchdog_clear(struct dpm_watchdog *wd) { struct timer_list *timer = &wd->timer; del_timer_sync(timer); destroy_timer_on_stack(timer); } #else #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) #define dpm_watchdog_set(x, y) #define dpm_watchdog_clear(x) #endif /*------------------------- Resume routines -------------------------*/ /** * dev_pm_skip_resume - System-wide device resume optimization check. * @dev: Target device. * * Return: * - %false if the transition under way is RESTORE. * - Return value of dev_pm_skip_suspend() if the transition under way is THAW. * - The logical negation of %power.must_resume otherwise (that is, when the * transition under way is RESUME). */ bool dev_pm_skip_resume(struct device *dev) { if (pm_transition.event == PM_EVENT_RESTORE) return false; if (pm_transition.event == PM_EVENT_THAW) return dev_pm_skip_suspend(dev); return !dev->power.must_resume; } static bool is_async(struct device *dev) { return dev->power.async_suspend && pm_async_enabled && !pm_trace_is_enabled(); } static bool dpm_async_fn(struct device *dev, async_func_t func) { reinit_completion(&dev->power.completion); if (is_async(dev)) { dev->power.async_in_progress = true; get_device(dev); if (async_schedule_dev_nocall(func, dev)) return true; put_device(dev); } /* * Because async_schedule_dev_nocall() above has returned false or it * has not been called at all, func() is not running and it is safe to * update the async_in_progress flag without extra synchronization. */ dev->power.async_in_progress = false; return false; } /** * device_resume_noirq - Execute a "noirq resume" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static void device_resume_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; bool skip_resume; int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore || dev->power.direct_complete) goto Out; if (!dev->power.is_noirq_suspended) goto Out; if (!dpm_wait_for_superior(dev, async)) goto Out; skip_resume = dev_pm_skip_resume(dev); /* * If the driver callback is skipped below or by the middle layer * callback and device_resume_early() also skips the driver callback for * this device later, it needs to appear as "suspended" to PM-runtime, * so change its status accordingly. * * Otherwise, the device is going to be resumed, so set its PM-runtime * status to "active", but do that only if DPM_FLAG_SMART_SUSPEND is set * to avoid confusing drivers that don't use it. */ if (skip_resume) pm_runtime_set_suspended(dev); else if (dev_pm_skip_suspend(dev)) pm_runtime_set_active(dev); if (dev->pm_domain) { info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (callback) goto Run; if (skip_resume) goto Skip; if (dev->driver && dev->driver->pm) { info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); Skip: dev->power.is_noirq_suspended = false; Out: complete_all(&dev->power.completion); TRACE_RESUME(error); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); } } static void async_resume_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume_noirq(dev, pm_transition, true); put_device(dev); } static void dpm_noirq_resume_devices(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, true); async_error = 0; pm_transition = state; mutex_lock(&dpm_list_mtx); /* * Trigger the resume of "async" devices upfront so they don't have to * wait for the "non-async" ones they don't depend on. */ list_for_each_entry(dev, &dpm_noirq_list, power.entry) dpm_async_fn(dev, async_resume_noirq); while (!list_empty(&dpm_noirq_list)) { dev = to_device(dpm_noirq_list.next); list_move_tail(&dev->power.entry, &dpm_late_early_list); if (!dev->power.async_in_progress) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume_noirq(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "noirq"); if (async_error) dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false); } /** * dpm_resume_noirq - Execute "noirq resume" callbacks for all devices. * @state: PM transition of the system being carried out. * * Invoke the "noirq" resume callbacks for all devices in dpm_noirq_list and * allow device drivers' interrupt handlers to be called. */ void dpm_resume_noirq(pm_message_t state) { dpm_noirq_resume_devices(state); resume_device_irqs(); device_wakeup_disarm_wake_irqs(); } /** * device_resume_early - Execute an "early resume" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. */ static void device_resume_early(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore || dev->power.direct_complete) goto Out; if (!dev->power.is_late_suspended) goto Out; if (!dpm_wait_for_superior(dev, async)) goto Out; if (dev->pm_domain) { info = "early power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "early type "; callback = pm_late_early_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "early class "; callback = pm_late_early_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "early bus "; callback = pm_late_early_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_resume(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "early driver "; callback = pm_late_early_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); Skip: dev->power.is_late_suspended = false; Out: TRACE_RESUME(error); pm_runtime_enable(dev); complete_all(&dev->power.completion); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async early" : " early", error); } } static void async_resume_early(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume_early(dev, pm_transition, true); put_device(dev); } /** * dpm_resume_early - Execute "early resume" callbacks for all devices. * @state: PM transition of the system being carried out. */ void dpm_resume_early(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_early"), state.event, true); async_error = 0; pm_transition = state; mutex_lock(&dpm_list_mtx); /* * Trigger the resume of "async" devices upfront so they don't have to * wait for the "non-async" ones they don't depend on. */ list_for_each_entry(dev, &dpm_late_early_list, power.entry) dpm_async_fn(dev, async_resume_early); while (!list_empty(&dpm_late_early_list)) { dev = to_device(dpm_late_early_list.next); list_move_tail(&dev->power.entry, &dpm_suspended_list); if (!dev->power.async_in_progress) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume_early(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "early"); if (async_error) dpm_save_failed_step(SUSPEND_RESUME_EARLY); trace_suspend_resume(TPS("dpm_resume_early"), state.event, false); } /** * dpm_resume_start - Execute "noirq" and "early" device callbacks. * @state: PM transition of the system being carried out. */ void dpm_resume_start(pm_message_t state) { dpm_resume_noirq(state); dpm_resume_early(state); } EXPORT_SYMBOL_GPL(dpm_resume_start); /** * device_resume - Execute "resume" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. */ static void device_resume(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore) goto Complete; if (dev->power.direct_complete) { /* Match the pm_runtime_disable() in __device_suspend(). */ pm_runtime_enable(dev); goto Complete; } if (!dpm_wait_for_superior(dev, async)) goto Complete; dpm_watchdog_set(&wd, dev); device_lock(dev); /* * This is a fib. But we'll allow new children to be added below * a resumed device, even if the device hasn't been completed yet. */ dev->power.is_prepared = false; if (!dev->power.is_suspended) goto Unlock; if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); goto Driver; } if (dev->type && dev->type->pm) { info = "type "; callback = pm_op(dev->type->pm, state); goto Driver; } if (dev->class && dev->class->pm) { info = "class "; callback = pm_op(dev->class->pm, state); goto Driver; } if (dev->bus) { if (dev->bus->pm) { info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->resume) { info = "legacy bus "; callback = dev->bus->resume; goto End; } } Driver: if (!callback && dev->driver && dev->driver->pm) { info = "driver "; callback = pm_op(dev->driver->pm, state); } End: error = dpm_run_callback(callback, dev, state, info); dev->power.is_suspended = false; Unlock: device_unlock(dev); dpm_watchdog_clear(&wd); Complete: complete_all(&dev->power.completion); TRACE_RESUME(error); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async" : "", error); } } static void async_resume(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume(dev, pm_transition, true); put_device(dev); } /** * dpm_resume - Execute "resume" callbacks for non-sysdev devices. * @state: PM transition of the system being carried out. * * Execute the appropriate "resume" callback for all devices whose status * indicates that they are suspended. */ void dpm_resume(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume"), state.event, true); might_sleep(); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); /* * Trigger the resume of "async" devices upfront so they don't have to * wait for the "non-async" ones they don't depend on. */ list_for_each_entry(dev, &dpm_suspended_list, power.entry) dpm_async_fn(dev, async_resume); while (!list_empty(&dpm_suspended_list)) { dev = to_device(dpm_suspended_list.next); list_move_tail(&dev->power.entry, &dpm_prepared_list); if (!dev->power.async_in_progress) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, NULL); if (async_error) dpm_save_failed_step(SUSPEND_RESUME); cpufreq_resume(); devfreq_resume(); trace_suspend_resume(TPS("dpm_resume"), state.event, false); } /** * device_complete - Complete a PM transition for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. */ static void device_complete(struct device *dev, pm_message_t state) { void (*callback)(struct device *) = NULL; const char *info = NULL; if (dev->power.syscore) goto out; device_lock(dev); if (dev->pm_domain) { info = "completing power domain "; callback = dev->pm_domain->ops.complete; } else if (dev->type && dev->type->pm) { info = "completing type "; callback = dev->type->pm->complete; } else if (dev->class && dev->class->pm) { info = "completing class "; callback = dev->class->pm->complete; } else if (dev->bus && dev->bus->pm) { info = "completing bus "; callback = dev->bus->pm->complete; } if (!callback && dev->driver && dev->driver->pm) { info = "completing driver "; callback = dev->driver->pm->complete; } if (callback) { pm_dev_dbg(dev, state, info); callback(dev); } device_unlock(dev); out: pm_runtime_put(dev); } /** * dpm_complete - Complete a PM transition for all non-sysdev devices. * @state: PM transition of the system being carried out. * * Execute the ->complete() callbacks for all devices whose PM status is not * DPM_ON (this allows new devices to be registered). */ void dpm_complete(pm_message_t state) { struct list_head list; trace_suspend_resume(TPS("dpm_complete"), state.event, true); might_sleep(); INIT_LIST_HEAD(&list); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_prepared_list)) { struct device *dev = to_device(dpm_prepared_list.prev); get_device(dev); dev->power.is_prepared = false; list_move(&dev->power.entry, &list); mutex_unlock(&dpm_list_mtx); trace_device_pm_callback_start(dev, "", state.event); device_complete(dev, state); trace_device_pm_callback_end(dev, 0); put_device(dev); mutex_lock(&dpm_list_mtx); } list_splice(&list, &dpm_list); mutex_unlock(&dpm_list_mtx); /* Allow device probing and trigger re-probing of deferred devices */ device_unblock_probing(); trace_suspend_resume(TPS("dpm_complete"), state.event, false); } /** * dpm_resume_end - Execute "resume" callbacks and complete system transition. * @state: PM transition of the system being carried out. * * Execute "resume" callbacks for all devices and complete the PM transition of * the system. */ void dpm_resume_end(pm_message_t state) { dpm_resume(state); dpm_complete(state); } EXPORT_SYMBOL_GPL(dpm_resume_end); /*------------------------- Suspend routines -------------------------*/ /** * resume_event - Return a "resume" message for given "suspend" sleep state. * @sleep_state: PM message representing a sleep state. * * Return a PM message representing the resume event corresponding to given * sleep state. */ static pm_message_t resume_event(pm_message_t sleep_state) { switch (sleep_state.event) { case PM_EVENT_SUSPEND: return PMSG_RESUME; case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return PMSG_RECOVER; case PM_EVENT_HIBERNATE: return PMSG_RESTORE; } return PMSG_ON; } static void dpm_superior_set_must_resume(struct device *dev) { struct device_link *link; int idx; if (dev->parent) dev->parent->power.must_resume = true; idx = device_links_read_lock(); list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) link->supplier->power.must_resume = true; device_links_read_unlock(idx); } /** * device_suspend_noirq - Execute a "noirq suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static int device_suspend_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (async_error) goto Complete; if (dev->power.syscore || dev->power.direct_complete) goto Complete; if (dev->pm_domain) { info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); goto Complete; } Skip: dev->power.is_noirq_suspended = true; /* * Skipping the resume of devices that were in use right before the * system suspend (as indicated by their PM-runtime usage counters) * would be suboptimal. Also resume them if doing that is not allowed * to be skipped. */ if (atomic_read(&dev->power.usage_count) > 1 || !(dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME) && dev->power.may_skip_resume)) dev->power.must_resume = true; if (dev->power.must_resume) dpm_superior_set_must_resume(dev); Complete: complete_all(&dev->power.completion); TRACE_SUSPEND(error); return error; } static void async_suspend_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend_noirq(dev, pm_transition, true); put_device(dev); } static int dpm_noirq_suspend_devices(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, true); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_late_early_list)) { struct device *dev = to_device(dpm_late_early_list.prev); list_move(&dev->power.entry, &dpm_noirq_list); if (dpm_async_fn(dev, async_suspend_noirq)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); error = device_suspend_noirq(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (error || async_error) break; } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); if (!error) error = async_error; if (error) dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ); dpm_show_time(starttime, state, error, "noirq"); trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, false); return error; } /** * dpm_suspend_noirq - Execute "noirq suspend" callbacks for all devices. * @state: PM transition of the system being carried out. * * Prevent device drivers' interrupt handlers from being called and invoke * "noirq" suspend callbacks for all non-sysdev devices. */ int dpm_suspend_noirq(pm_message_t state) { int ret; device_wakeup_arm_wake_irqs(); suspend_device_irqs(); ret = dpm_noirq_suspend_devices(state); if (ret) dpm_resume_noirq(resume_event(state)); return ret; } static void dpm_propagate_wakeup_to_parent(struct device *dev) { struct device *parent = dev->parent; if (!parent) return; spin_lock_irq(&parent->power.lock); if (device_wakeup_path(dev) && !parent->power.ignore_children) parent->power.wakeup_path = true; spin_unlock_irq(&parent->power.lock); } /** * device_suspend_late - Execute a "late suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. */ static int device_suspend_late(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_SUSPEND(0); __pm_runtime_disable(dev, false); dpm_wait_for_subordinate(dev, async); if (async_error) goto Complete; if (pm_wakeup_pending()) { async_error = -EBUSY; goto Complete; } if (dev->power.syscore || dev->power.direct_complete) goto Complete; if (dev->pm_domain) { info = "late power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "late type "; callback = pm_late_early_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "late class "; callback = pm_late_early_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "late bus "; callback = pm_late_early_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "late driver "; callback = pm_late_early_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async late" : " late", error); goto Complete; } dpm_propagate_wakeup_to_parent(dev); Skip: dev->power.is_late_suspended = true; Complete: TRACE_SUSPEND(error); complete_all(&dev->power.completion); return error; } static void async_suspend_late(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend_late(dev, pm_transition, true); put_device(dev); } /** * dpm_suspend_late - Execute "late suspend" callbacks for all devices. * @state: PM transition of the system being carried out. */ int dpm_suspend_late(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; trace_suspend_resume(TPS("dpm_suspend_late"), state.event, true); pm_transition = state; async_error = 0; wake_up_all_idle_cpus(); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_suspended_list)) { struct device *dev = to_device(dpm_suspended_list.prev); list_move(&dev->power.entry, &dpm_late_early_list); if (dpm_async_fn(dev, async_suspend_late)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); error = device_suspend_late(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (error || async_error) break; } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); if (!error) error = async_error; if (error) { dpm_save_failed_step(SUSPEND_SUSPEND_LATE); dpm_resume_early(resume_event(state)); } dpm_show_time(starttime, state, error, "late"); trace_suspend_resume(TPS("dpm_suspend_late"), state.event, false); return error; } /** * dpm_suspend_end - Execute "late" and "noirq" device suspend callbacks. * @state: PM transition of the system being carried out. */ int dpm_suspend_end(pm_message_t state) { ktime_t starttime = ktime_get(); int error; error = dpm_suspend_late(state); if (error) goto out; error = dpm_suspend_noirq(state); if (error) dpm_resume_early(resume_event(state)); out: dpm_show_time(starttime, state, error, "end"); return error; } EXPORT_SYMBOL_GPL(dpm_suspend_end); /** * legacy_suspend - Execute a legacy (bus or class) suspend callback for device. * @dev: Device to suspend. * @state: PM transition of the system being carried out. * @cb: Suspend callback to execute. * @info: string description of caller. */ static int legacy_suspend(struct device *dev, pm_message_t state, int (*cb)(struct device *dev, pm_message_t state), const char *info) { int error; ktime_t calltime; calltime = initcall_debug_start(dev, cb); trace_device_pm_callback_start(dev, info, state.event); error = cb(dev, state); trace_device_pm_callback_end(dev, error); suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); return error; } static void dpm_clear_superiors_direct_complete(struct device *dev) { struct device_link *link; int idx; if (dev->parent) { spin_lock_irq(&dev->parent->power.lock); dev->parent->power.direct_complete = false; spin_unlock_irq(&dev->parent->power.lock); } idx = device_links_read_lock(); list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) { spin_lock_irq(&link->supplier->power.lock); link->supplier->power.direct_complete = false; spin_unlock_irq(&link->supplier->power.lock); } device_links_read_unlock(idx); } /** * device_suspend - Execute "suspend" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. */ static int device_suspend(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (async_error) { dev->power.direct_complete = false; goto Complete; } /* * Wait for possible runtime PM transitions of the device in progress * to complete and if there's a runtime resume request pending for it, * resume it before proceeding with invoking the system-wide suspend * callbacks for it. * * If the system-wide suspend callbacks below change the configuration * of the device, they must disable runtime PM for it or otherwise * ensure that its runtime-resume callbacks will not be confused by that * change in case they are invoked going forward. */ pm_runtime_barrier(dev); if (pm_wakeup_pending()) { dev->power.direct_complete = false; async_error = -EBUSY; goto Complete; } if (dev->power.syscore) goto Complete; /* Avoid direct_complete to let wakeup_path propagate. */ if (device_may_wakeup(dev) || device_wakeup_path(dev)) dev->power.direct_complete = false; if (dev->power.direct_complete) { if (pm_runtime_status_suspended(dev)) { pm_runtime_disable(dev); if (pm_runtime_status_suspended(dev)) { pm_dev_dbg(dev, state, "direct-complete "); goto Complete; } pm_runtime_enable(dev); } dev->power.direct_complete = false; } dev->power.may_skip_resume = true; dev->power.must_resume = !dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME); dpm_watchdog_set(&wd, dev); device_lock(dev); if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); goto Run; } if (dev->type && dev->type->pm) { info = "type "; callback = pm_op(dev->type->pm, state); goto Run; } if (dev->class && dev->class->pm) { info = "class "; callback = pm_op(dev->class->pm, state); goto Run; } if (dev->bus) { if (dev->bus->pm) { info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->suspend) { pm_dev_dbg(dev, state, "legacy bus "); error = legacy_suspend(dev, state, dev->bus->suspend, "legacy bus "); goto End; } } Run: if (!callback && dev->driver && dev->driver->pm) { info = "driver "; callback = pm_op(dev->driver->pm, state); } error = dpm_run_callback(callback, dev, state, info); End: if (!error) { dev->power.is_suspended = true; if (device_may_wakeup(dev)) dev->power.wakeup_path = true; dpm_propagate_wakeup_to_parent(dev); dpm_clear_superiors_direct_complete(dev); } device_unlock(dev); dpm_watchdog_clear(&wd); Complete: if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async" : "", error); } complete_all(&dev->power.completion); TRACE_SUSPEND(error); return error; } static void async_suspend(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend(dev, pm_transition, true); put_device(dev); } /** * dpm_suspend - Execute "suspend" callbacks for all non-sysdev devices. * @state: PM transition of the system being carried out. */ int dpm_suspend(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; trace_suspend_resume(TPS("dpm_suspend"), state.event, true); might_sleep(); devfreq_suspend(); cpufreq_suspend(); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_prepared_list)) { struct device *dev = to_device(dpm_prepared_list.prev); list_move(&dev->power.entry, &dpm_suspended_list); if (dpm_async_fn(dev, async_suspend)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); error = device_suspend(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (error || async_error) break; } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); if (!error) error = async_error; if (error) dpm_save_failed_step(SUSPEND_SUSPEND); dpm_show_time(starttime, state, error, NULL); trace_suspend_resume(TPS("dpm_suspend"), state.event, false); return error; } /** * device_prepare - Prepare a device for system power transition. * @dev: Device to handle. * @state: PM transition of the system being carried out. * * Execute the ->prepare() callback(s) for given device. No new children of the * device may be registered after this function has returned. */ static int device_prepare(struct device *dev, pm_message_t state) { int (*callback)(struct device *) = NULL; int ret = 0; /* * If a device's parent goes into runtime suspend at the wrong time, * it won't be possible to resume the device. To prevent this we * block runtime suspend here, during the prepare phase, and allow * it again during the complete phase. */ pm_runtime_get_noresume(dev); if (dev->power.syscore) return 0; device_lock(dev); dev->power.wakeup_path = false; if (dev->power.no_pm_callbacks) goto unlock; if (dev->pm_domain) callback = dev->pm_domain->ops.prepare; else if (dev->type && dev->type->pm) callback = dev->type->pm->prepare; else if (dev->class && dev->class->pm) callback = dev->class->pm->prepare; else if (dev->bus && dev->bus->pm) callback = dev->bus->pm->prepare; if (!callback && dev->driver && dev->driver->pm) callback = dev->driver->pm->prepare; if (callback) ret = callback(dev); unlock: device_unlock(dev); if (ret < 0) { suspend_report_result(dev, callback, ret); pm_runtime_put(dev); return ret; } /* * A positive return value from ->prepare() means "this device appears * to be runtime-suspended and its state is fine, so if it really is * runtime-suspended, you can leave it in that state provided that you * will do the same thing with all of its descendants". This only * applies to suspend transitions, however. */ spin_lock_irq(&dev->power.lock); dev->power.direct_complete = state.event == PM_EVENT_SUSPEND && (ret > 0 || dev->power.no_pm_callbacks) && !dev_pm_test_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE); spin_unlock_irq(&dev->power.lock); return 0; } /** * dpm_prepare - Prepare all non-sysdev devices for a system PM transition. * @state: PM transition of the system being carried out. * * Execute the ->prepare() callback(s) for all devices. */ int dpm_prepare(pm_message_t state) { int error = 0; trace_suspend_resume(TPS("dpm_prepare"), state.event, true); might_sleep(); /* * Give a chance for the known devices to complete their probes, before * disable probing of devices. This sync point is important at least * at boot time + hibernation restore. */ wait_for_device_probe(); /* * It is unsafe if probing of devices will happen during suspend or * hibernation and system behavior will be unpredictable in this case. * So, let's prohibit device's probing here and defer their probes * instead. The normal behavior will be restored in dpm_complete(). */ device_block_probing(); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_list) && !error) { struct device *dev = to_device(dpm_list.next); get_device(dev); mutex_unlock(&dpm_list_mtx); trace_device_pm_callback_start(dev, "", state.event); error = device_prepare(dev, state); trace_device_pm_callback_end(dev, error); mutex_lock(&dpm_list_mtx); if (!error) { dev->power.is_prepared = true; if (!list_empty(&dev->power.entry)) list_move_tail(&dev->power.entry, &dpm_prepared_list); } else if (error == -EAGAIN) { error = 0; } else { dev_info(dev, "not prepared for power transition: code %d\n", error); } mutex_unlock(&dpm_list_mtx); put_device(dev); mutex_lock(&dpm_list_mtx); } mutex_unlock(&dpm_list_mtx); trace_suspend_resume(TPS("dpm_prepare"), state.event, false); return error; } /** * dpm_suspend_start - Prepare devices for PM transition and suspend them. * @state: PM transition of the system being carried out. * * Prepare all non-sysdev devices for system PM transition and execute "suspend" * callbacks for them. */ int dpm_suspend_start(pm_message_t state) { ktime_t starttime = ktime_get(); int error; error = dpm_prepare(state); if (error) dpm_save_failed_step(SUSPEND_PREPARE); else error = dpm_suspend(state); dpm_show_time(starttime, state, error, "start"); return error; } EXPORT_SYMBOL_GPL(dpm_suspend_start); void __suspend_report_result(const char *function, struct device *dev, void *fn, int ret) { if (ret) dev_err(dev, "%s(): %ps returns %d\n", function, fn, ret); } EXPORT_SYMBOL_GPL(__suspend_report_result); /** * device_pm_wait_for_dev - Wait for suspend/resume of a device to complete. * @subordinate: Device that needs to wait for @dev. * @dev: Device to wait for. */ int device_pm_wait_for_dev(struct device *subordinate, struct device *dev) { dpm_wait(dev, subordinate->power.async_suspend); return async_error; } EXPORT_SYMBOL_GPL(device_pm_wait_for_dev); /** * dpm_for_each_dev - device iterator. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over devices in dpm_list, and call @fn for each device, * passing it @data. */ void dpm_for_each_dev(void *data, void (*fn)(struct device *, void *)) { struct device *dev; if (!fn) return; device_pm_lock(); list_for_each_entry(dev, &dpm_list, power.entry) fn(dev, data); device_pm_unlock(); } EXPORT_SYMBOL_GPL(dpm_for_each_dev); static bool pm_ops_is_empty(const struct dev_pm_ops *ops) { if (!ops) return true; return !ops->prepare && !ops->suspend && !ops->suspend_late && !ops->suspend_noirq && !ops->resume_noirq && !ops->resume_early && !ops->resume && !ops->complete; } void device_pm_check_callbacks(struct device *dev) { unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); dev->power.no_pm_callbacks = (!dev->bus || (pm_ops_is_empty(dev->bus->pm) && !dev->bus->suspend && !dev->bus->resume)) && (!dev->class || pm_ops_is_empty(dev->class->pm)) && (!dev->type || pm_ops_is_empty(dev->type->pm)) && (!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) && (!dev->driver || (pm_ops_is_empty(dev->driver->pm) && !dev->driver->suspend && !dev->driver->resume)); spin_unlock_irqrestore(&dev->power.lock, flags); } bool dev_pm_skip_suspend(struct device *dev) { return dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) && pm_runtime_status_suspended(dev); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CPUSET_H #define _LINUX_CPUSET_H /* * cpuset interface * * Copyright (C) 2003 BULL SA * Copyright (C) 2004-2006 Silicon Graphics, Inc. * */ #include <linux/sched.h> #include <linux/sched/topology.h> #include <linux/sched/task.h> #include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/jump_label.h> #ifdef CONFIG_CPUSETS /* * Static branch rewrites can happen in an arbitrary order for a given * key. In code paths where we need to loop with read_mems_allowed_begin() and * read_mems_allowed_retry() to get a consistent view of mems_allowed, we need * to ensure that begin() always gets rewritten before retry() in the * disabled -> enabled transition. If not, then if local irqs are disabled * around the loop, we can deadlock since retry() would always be * comparing the latest value of the mems_allowed seqcount against 0 as * begin() still would see cpusets_enabled() as false. The enabled -> disabled * transition should happen in reverse order for the same reasons (want to stop * looking at real value of mems_allowed.sequence in retry() first). */ extern struct static_key_false cpusets_pre_enable_key; extern struct static_key_false cpusets_enabled_key; extern struct static_key_false cpusets_insane_config_key; static inline bool cpusets_enabled(void) { return static_branch_unlikely(&cpusets_enabled_key); } static inline void cpuset_inc(void) { static_branch_inc_cpuslocked(&cpusets_pre_enable_key); static_branch_inc_cpuslocked(&cpusets_enabled_key); } static inline void cpuset_dec(void) { static_branch_dec_cpuslocked(&cpusets_enabled_key); static_branch_dec_cpuslocked(&cpusets_pre_enable_key); } /* * This will get enabled whenever a cpuset configuration is considered * unsupportable in general. E.g. movable only node which cannot satisfy * any non movable allocations (see update_nodemask). Page allocator * needs to make additional checks for those configurations and this * check is meant to guard those checks without any overhead for sane * configurations. */ static inline bool cpusets_insane_config(void) { return static_branch_unlikely(&cpusets_insane_config_key); } extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); extern void inc_dl_tasks_cs(struct task_struct *task); extern void dec_dl_tasks_cs(struct task_struct *task); extern void cpuset_lock(void); extern void cpuset_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); extern bool cpuset_cpu_is_isolated(int cpu); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); extern bool cpuset_node_allowed(int node, gfp_t gfp_mask); static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return cpuset_node_allowed(zone_to_nid(z), gfp_mask); } static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { if (cpusets_enabled()) return __cpuset_zone_allowed(z, gfp_mask); return true; } extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, const struct task_struct *tsk2); #ifdef CONFIG_CPUSETS_V1 #define cpuset_memory_pressure_bump() \ do { \ if (cpuset_memory_pressure_enabled) \ __cpuset_memory_pressure_bump(); \ } while (0) extern int cpuset_memory_pressure_enabled; extern void __cpuset_memory_pressure_bump(void); #else static inline void cpuset_memory_pressure_bump(void) { } #endif extern void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task); extern int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); extern int cpuset_mem_spread_node(void); static inline int cpuset_do_page_mem_spread(void) { return task_spread_page(current); } extern bool current_cpuset_is_being_rebound(void); extern void rebuild_sched_domains(void); extern void cpuset_print_current_mems_allowed(void); /* * read_mems_allowed_begin is required when making decisions involving * mems_allowed such as during page allocation. mems_allowed can be updated in * parallel and depending on the new value an operation can fail potentially * causing process failure. A retry loop with read_mems_allowed_begin and * read_mems_allowed_retry prevents these artificial failures. */ static inline unsigned int read_mems_allowed_begin(void) { if (!static_branch_unlikely(&cpusets_pre_enable_key)) return 0; return read_seqcount_begin(&current->mems_allowed_seq); } /* * If this returns true, the operation that took place after * read_mems_allowed_begin may have failed artificially due to a concurrent * update of mems_allowed. It is up to the caller to retry the operation if * appropriate. */ static inline bool read_mems_allowed_retry(unsigned int seq) { if (!static_branch_unlikely(&cpusets_enabled_key)) return false; return read_seqcount_retry(&current->mems_allowed_seq, seq); } static inline void set_mems_allowed(nodemask_t nodemask) { unsigned long flags; task_lock(current); local_irq_save(flags); write_seqcount_begin(&current->mems_allowed_seq); current->mems_allowed = nodemask; write_seqcount_end(&current->mems_allowed_seq); local_irq_restore(flags); task_unlock(current); } #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } static inline bool cpusets_insane_config(void) { return false; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} static inline void cpuset_force_rebuild(void) { } static inline void cpuset_update_active_cpus(void) { partition_sched_domains(1, NULL, NULL); } static inline void inc_dl_tasks_cs(struct task_struct *task) { } static inline void dec_dl_tasks_cs(struct task_struct *task) { } static inline void cpuset_lock(void) { } static inline void cpuset_unlock(void) { } static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { cpumask_copy(mask, task_cpu_possible_mask(p)); } static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p) { return false; } static inline bool cpuset_cpu_is_isolated(int cpu) { return false; } static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { return node_possible_map; } #define cpuset_current_mems_allowed (node_states[N_MEMORY]) static inline void cpuset_init_current_mems_allowed(void) {} static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { return 1; } static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return true; } static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return true; } static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, const struct task_struct *tsk2) { return 1; } static inline void cpuset_memory_pressure_bump(void) {} static inline void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { } static inline int cpuset_mem_spread_node(void) { return 0; } static inline int cpuset_do_page_mem_spread(void) { return 0; } static inline bool current_cpuset_is_being_rebound(void) { return false; } static inline void rebuild_sched_domains(void) { partition_sched_domains(1, NULL, NULL); } static inline void cpuset_print_current_mems_allowed(void) { } static inline void set_mems_allowed(nodemask_t nodemask) { } static inline unsigned int read_mems_allowed_begin(void) { return 0; } static inline bool read_mems_allowed_retry(unsigned int seq) { return false; } #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */
13 13 44 44 44 44 44 44 13 13 13 45 45 4 44 43 4 35 3 3 1 1 1 29 45 46 44 44 44 10 11 2 2 3 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 // SPDX-License-Identifier: GPL-2.0-only /* * File: socket.c * * Phonet sockets * * Copyright (C) 2008 Nokia Corporation. * * Authors: Sakari Ailus <sakari.ailus@nokia.com> * Rémi Denis-Courmont */ #include <linux/gfp.h> #include <linux/kernel.h> #include <linux/net.h> #include <linux/poll.h> #include <linux/sched/signal.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/phonet.h> #include <linux/export.h> #include <net/phonet/phonet.h> #include <net/phonet/pep.h> #include <net/phonet/pn_dev.h> static int pn_socket_release(struct socket *sock) { struct sock *sk = sock->sk; if (sk) { sock->sk = NULL; sk->sk_prot->close(sk, 0); } return 0; } #define PN_HASHSIZE 16 #define PN_HASHMASK (PN_HASHSIZE-1) static struct { struct hlist_head hlist[PN_HASHSIZE]; struct mutex lock; } pnsocks; void __init pn_sock_init(void) { unsigned int i; for (i = 0; i < PN_HASHSIZE; i++) INIT_HLIST_HEAD(pnsocks.hlist + i); mutex_init(&pnsocks.lock); } static struct hlist_head *pn_hash_list(u16 obj) { return pnsocks.hlist + (obj & PN_HASHMASK); } /* * Find address based on socket address, match only certain fields. * Also grab sock if it was found. Remember to sock_put it later. */ struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *spn) { struct sock *sknode; struct sock *rval = NULL; u16 obj = pn_sockaddr_get_object(spn); u8 res = spn->spn_resource; struct hlist_head *hlist = pn_hash_list(obj); rcu_read_lock(); sk_for_each_rcu(sknode, hlist) { struct pn_sock *pn = pn_sk(sknode); BUG_ON(!pn->sobject); /* unbound socket */ if (!net_eq(sock_net(sknode), net)) continue; if (pn_port(obj)) { /* Look up socket by port */ if (pn_port(pn->sobject) != pn_port(obj)) continue; } else { /* If port is zero, look up by resource */ if (pn->resource != res) continue; } if (pn_addr(pn->sobject) && pn_addr(pn->sobject) != pn_addr(obj)) continue; rval = sknode; sock_hold(sknode); break; } rcu_read_unlock(); return rval; } /* Deliver a broadcast packet (only in bottom-half) */ void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb) { struct hlist_head *hlist = pnsocks.hlist; unsigned int h; rcu_read_lock(); for (h = 0; h < PN_HASHSIZE; h++) { struct sock *sknode; sk_for_each(sknode, hlist) { struct sk_buff *clone; if (!net_eq(sock_net(sknode), net)) continue; if (!sock_flag(sknode, SOCK_BROADCAST)) continue; clone = skb_clone(skb, GFP_ATOMIC); if (clone) { sock_hold(sknode); sk_receive_skb(sknode, clone, 0); } } hlist++; } rcu_read_unlock(); } int pn_sock_hash(struct sock *sk) { struct hlist_head *hlist = pn_hash_list(pn_sk(sk)->sobject); mutex_lock(&pnsocks.lock); sk_add_node_rcu(sk, hlist); mutex_unlock(&pnsocks.lock); return 0; } EXPORT_SYMBOL(pn_sock_hash); void pn_sock_unhash(struct sock *sk) { mutex_lock(&pnsocks.lock); sk_del_node_init_rcu(sk); mutex_unlock(&pnsocks.lock); pn_sock_unbind_all_res(sk); synchronize_rcu(); } EXPORT_SYMBOL(pn_sock_unhash); static DEFINE_MUTEX(port_mutex); static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len) { struct sock *sk = sock->sk; struct pn_sock *pn = pn_sk(sk); struct sockaddr_pn *spn = (struct sockaddr_pn *)addr; int err; u16 handle; u8 saddr; if (sk->sk_prot->bind) return sk->sk_prot->bind(sk, addr, len); if (len < sizeof(struct sockaddr_pn)) return -EINVAL; if (spn->spn_family != AF_PHONET) return -EAFNOSUPPORT; handle = pn_sockaddr_get_object((struct sockaddr_pn *)addr); saddr = pn_addr(handle); if (saddr && phonet_address_lookup(sock_net(sk), saddr)) return -EADDRNOTAVAIL; lock_sock(sk); if (sk->sk_state != TCP_CLOSE || pn_port(pn->sobject)) { err = -EINVAL; /* attempt to rebind */ goto out; } WARN_ON(sk_hashed(sk)); mutex_lock(&port_mutex); err = sk->sk_prot->get_port(sk, pn_port(handle)); if (err) goto out_port; /* get_port() sets the port, bind() sets the address if applicable */ pn->sobject = pn_object(saddr, pn_port(pn->sobject)); pn->resource = spn->spn_resource; /* Enable RX on the socket */ err = sk->sk_prot->hash(sk); out_port: mutex_unlock(&port_mutex); out: release_sock(sk); return err; } static int pn_socket_autobind(struct socket *sock) { struct sockaddr_pn sa; int err; memset(&sa, 0, sizeof(sa)); sa.spn_family = AF_PHONET; err = pn_socket_bind(sock, (struct sockaddr *)&sa, sizeof(struct sockaddr_pn)); if (err != -EINVAL) return err; BUG_ON(!pn_port(pn_sk(sock->sk)->sobject)); return 0; /* socket was already bound */ } static int pn_socket_connect(struct socket *sock, struct sockaddr *addr, int len, int flags) { struct sock *sk = sock->sk; struct pn_sock *pn = pn_sk(sk); struct sockaddr_pn *spn = (struct sockaddr_pn *)addr; struct task_struct *tsk = current; long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); int err; if (pn_socket_autobind(sock)) return -ENOBUFS; if (len < sizeof(struct sockaddr_pn)) return -EINVAL; if (spn->spn_family != AF_PHONET) return -EAFNOSUPPORT; lock_sock(sk); switch (sock->state) { case SS_UNCONNECTED: if (sk->sk_state != TCP_CLOSE) { err = -EISCONN; goto out; } break; case SS_CONNECTING: err = -EALREADY; goto out; default: err = -EISCONN; goto out; } pn->dobject = pn_sockaddr_get_object(spn); pn->resource = pn_sockaddr_get_resource(spn); sock->state = SS_CONNECTING; err = sk->sk_prot->connect(sk, addr, len); if (err) { sock->state = SS_UNCONNECTED; pn->dobject = 0; goto out; } while (sk->sk_state == TCP_SYN_SENT) { DEFINE_WAIT(wait); if (!timeo) { err = -EINPROGRESS; goto out; } if (signal_pending(tsk)) { err = sock_intr_errno(timeo); goto out; } prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); finish_wait(sk_sleep(sk), &wait); } if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED)) err = 0; else if (sk->sk_state == TCP_CLOSE_WAIT) err = -ECONNRESET; else err = -ECONNREFUSED; sock->state = err ? SS_UNCONNECTED : SS_CONNECTED; out: release_sock(sk); return err; } static int pn_socket_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sock *newsk; if (unlikely(sk->sk_state != TCP_LISTEN)) return -EINVAL; newsk = sk->sk_prot->accept(sk, arg); if (!newsk) return arg->err; lock_sock(newsk); sock_graft(newsk, newsock); newsock->state = SS_CONNECTED; release_sock(newsk); return 0; } static int pn_socket_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sock *sk = sock->sk; struct pn_sock *pn = pn_sk(sk); memset(addr, 0, sizeof(struct sockaddr_pn)); addr->sa_family = AF_PHONET; if (!peer) /* Race with bind() here is userland's problem. */ pn_sockaddr_set_object((struct sockaddr_pn *)addr, pn->sobject); return sizeof(struct sockaddr_pn); } static __poll_t pn_socket_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct pep_sock *pn = pep_sk(sk); __poll_t mask = 0; poll_wait(file, sk_sleep(sk), wait); if (sk->sk_state == TCP_CLOSE) return EPOLLERR; if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (!skb_queue_empty_lockless(&pn->ctrlreq_queue)) mask |= EPOLLPRI; if (!mask && sk->sk_state == TCP_CLOSE_WAIT) return EPOLLHUP; if (sk->sk_state == TCP_ESTABLISHED && refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf && atomic_read(&pn->tx_credits)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; return mask; } static int pn_socket_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct pn_sock *pn = pn_sk(sk); if (cmd == SIOCPNGETOBJECT) { struct net_device *dev; u16 handle; u8 saddr; if (get_user(handle, (__u16 __user *)arg)) return -EFAULT; lock_sock(sk); if (sk->sk_bound_dev_if) dev = dev_get_by_index(sock_net(sk), sk->sk_bound_dev_if); else dev = phonet_device_get(sock_net(sk)); if (dev && (dev->flags & IFF_UP)) saddr = phonet_address_get(dev, pn_addr(handle)); else saddr = PN_NO_ADDR; release_sock(sk); dev_put(dev); if (saddr == PN_NO_ADDR) return -EHOSTUNREACH; handle = pn_object(saddr, pn_port(pn->sobject)); return put_user(handle, (__u16 __user *)arg); } return sk_ioctl(sk, cmd, (void __user *)arg); } static int pn_socket_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int err = 0; if (pn_socket_autobind(sock)) return -ENOBUFS; lock_sock(sk); if (sock->state != SS_UNCONNECTED) { err = -EINVAL; goto out; } if (sk->sk_state != TCP_LISTEN) { sk->sk_state = TCP_LISTEN; sk->sk_ack_backlog = 0; } sk->sk_max_ack_backlog = backlog; out: release_sock(sk); return err; } static int pn_socket_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { struct sock *sk = sock->sk; if (pn_socket_autobind(sock)) return -EAGAIN; return sk->sk_prot->sendmsg(sk, m, total_len); } const struct proto_ops phonet_dgram_ops = { .family = AF_PHONET, .owner = THIS_MODULE, .release = pn_socket_release, .bind = pn_socket_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pn_socket_getname, .poll = datagram_poll, .ioctl = pn_socket_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = pn_socket_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, }; const struct proto_ops phonet_stream_ops = { .family = AF_PHONET, .owner = THIS_MODULE, .release = pn_socket_release, .bind = pn_socket_bind, .connect = pn_socket_connect, .socketpair = sock_no_socketpair, .accept = pn_socket_accept, .getname = pn_socket_getname, .poll = pn_socket_poll, .ioctl = pn_socket_ioctl, .listen = pn_socket_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = pn_socket_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, }; EXPORT_SYMBOL(phonet_stream_ops); /* allocate port for a socket */ int pn_sock_get_port(struct sock *sk, unsigned short sport) { static int port_cur; struct net *net = sock_net(sk); struct pn_sock *pn = pn_sk(sk); struct sockaddr_pn try_sa; struct sock *tmpsk; memset(&try_sa, 0, sizeof(struct sockaddr_pn)); try_sa.spn_family = AF_PHONET; WARN_ON(!mutex_is_locked(&port_mutex)); if (!sport) { /* search free port */ int port, pmin, pmax; phonet_get_local_port_range(&pmin, &pmax); for (port = pmin; port <= pmax; port++) { port_cur++; if (port_cur < pmin || port_cur > pmax) port_cur = pmin; pn_sockaddr_set_port(&try_sa, port_cur); tmpsk = pn_find_sock_by_sa(net, &try_sa); if (tmpsk == NULL) { sport = port_cur; goto found; } else sock_put(tmpsk); } } else { /* try to find specific port */ pn_sockaddr_set_port(&try_sa, sport); tmpsk = pn_find_sock_by_sa(net, &try_sa); if (tmpsk == NULL) /* No sock there! We can use that port... */ goto found; else sock_put(tmpsk); } /* the port must be in use already */ return -EADDRINUSE; found: pn->sobject = pn_object(pn_addr(pn->sobject), sport); return 0; } EXPORT_SYMBOL(pn_sock_get_port); #ifdef CONFIG_PROC_FS static struct sock *pn_sock_get_idx(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); struct hlist_head *hlist = pnsocks.hlist; struct sock *sknode; unsigned int h; for (h = 0; h < PN_HASHSIZE; h++) { sk_for_each_rcu(sknode, hlist) { if (!net_eq(net, sock_net(sknode))) continue; if (!pos) return sknode; pos--; } hlist++; } return NULL; } static struct sock *pn_sock_get_next(struct seq_file *seq, struct sock *sk) { struct net *net = seq_file_net(seq); do sk = sk_next(sk); while (sk && !net_eq(net, sock_net(sk))); return sk; } static void *pn_sock_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); return *pos ? pn_sock_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *pn_sock_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == SEQ_START_TOKEN) sk = pn_sock_get_idx(seq, 0); else sk = pn_sock_get_next(seq, v); (*pos)++; return sk; } static void pn_sock_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { rcu_read_unlock(); } static int pn_sock_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) seq_puts(seq, "pt loc rem rs st tx_queue rx_queue " " uid inode ref pointer drops"); else { struct sock *sk = v; struct pn_sock *pn = pn_sk(sk); seq_printf(seq, "%2d %04X:%04X:%02X %02X %08X:%08X %5d %lu " "%d %pK %u", sk->sk_protocol, pn->sobject, pn->dobject, pn->resource, sk->sk_state, sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk), from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), sock_i_ino(sk), refcount_read(&sk->sk_refcnt), sk, atomic_read(&sk->sk_drops)); } seq_pad(seq, '\n'); return 0; } const struct seq_operations pn_sock_seq_ops = { .start = pn_sock_seq_start, .next = pn_sock_seq_next, .stop = pn_sock_seq_stop, .show = pn_sock_seq_show, }; #endif static struct { struct sock *sk[256]; } pnres; /* * Find and hold socket based on resource. */ struct sock *pn_find_sock_by_res(struct net *net, u8 res) { struct sock *sk; if (!net_eq(net, &init_net)) return NULL; rcu_read_lock(); sk = rcu_dereference(pnres.sk[res]); if (sk) sock_hold(sk); rcu_read_unlock(); return sk; } static DEFINE_MUTEX(resource_mutex); int pn_sock_bind_res(struct sock *sk, u8 res) { int ret = -EADDRINUSE; if (!net_eq(sock_net(sk), &init_net)) return -ENOIOCTLCMD; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (pn_socket_autobind(sk->sk_socket)) return -EAGAIN; mutex_lock(&resource_mutex); if (pnres.sk[res] == NULL) { sock_hold(sk); rcu_assign_pointer(pnres.sk[res], sk); ret = 0; } mutex_unlock(&resource_mutex); return ret; } int pn_sock_unbind_res(struct sock *sk, u8 res) { int ret = -ENOENT; if (!capable(CAP_SYS_ADMIN)) return -EPERM; mutex_lock(&resource_mutex); if (pnres.sk[res] == sk) { RCU_INIT_POINTER(pnres.sk[res], NULL); ret = 0; } mutex_unlock(&resource_mutex); if (ret == 0) { synchronize_rcu(); sock_put(sk); } return ret; } void pn_sock_unbind_all_res(struct sock *sk) { unsigned int res, match = 0; mutex_lock(&resource_mutex); for (res = 0; res < 256; res++) { if (pnres.sk[res] == sk) { RCU_INIT_POINTER(pnres.sk[res], NULL); match++; } } mutex_unlock(&resource_mutex); while (match > 0) { __sock_put(sk); match--; } /* Caller is responsible for RCU sync before final sock_put() */ } #ifdef CONFIG_PROC_FS static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); unsigned int i; if (!net_eq(net, &init_net)) return NULL; for (i = 0; i < 256; i++) { if (pnres.sk[i] == NULL) continue; if (!pos) return pnres.sk + i; pos--; } return NULL; } static struct sock **pn_res_get_next(struct seq_file *seq, struct sock **sk) { struct net *net = seq_file_net(seq); unsigned int i; BUG_ON(!net_eq(net, &init_net)); for (i = (sk - pnres.sk) + 1; i < 256; i++) if (pnres.sk[i]) return pnres.sk + i; return NULL; } static void *pn_res_seq_start(struct seq_file *seq, loff_t *pos) __acquires(resource_mutex) { mutex_lock(&resource_mutex); return *pos ? pn_res_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *pn_res_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock **sk; if (v == SEQ_START_TOKEN) sk = pn_res_get_idx(seq, 0); else sk = pn_res_get_next(seq, v); (*pos)++; return sk; } static void pn_res_seq_stop(struct seq_file *seq, void *v) __releases(resource_mutex) { mutex_unlock(&resource_mutex); } static int pn_res_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 63); if (v == SEQ_START_TOKEN) seq_puts(seq, "rs uid inode"); else { struct sock **psk = v; struct sock *sk = *psk; seq_printf(seq, "%02X %5u %lu", (int) (psk - pnres.sk), from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), sock_i_ino(sk)); } seq_pad(seq, '\n'); return 0; } const struct seq_operations pn_res_seq_ops = { .start = pn_res_seq_start, .next = pn_res_seq_next, .stop = pn_res_seq_stop, .show = pn_res_seq_show, }; #endif
56 56 56 56 56 24 24 24 18 17 18 18 18 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2022, SUSE. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/spinlock.h> #include "protocol.h" static DEFINE_SPINLOCK(mptcp_sched_list_lock); static LIST_HEAD(mptcp_sched_list); static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk, struct mptcp_sched_data *data) { struct sock *ssk; ssk = data->reinject ? mptcp_subflow_get_retrans(msk) : mptcp_subflow_get_send(msk); if (!ssk) return -EINVAL; mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); return 0; } static struct mptcp_sched_ops mptcp_sched_default = { .get_subflow = mptcp_sched_default_get_subflow, .name = "default", .owner = THIS_MODULE, }; /* Must be called with rcu read lock held */ struct mptcp_sched_ops *mptcp_sched_find(const char *name) { struct mptcp_sched_ops *sched, *ret = NULL; list_for_each_entry_rcu(sched, &mptcp_sched_list, list) { if (!strcmp(sched->name, name)) { ret = sched; break; } } return ret; } /* Build string with list of available scheduler values. * Similar to tcp_get_available_congestion_control() */ void mptcp_get_available_schedulers(char *buf, size_t maxlen) { struct mptcp_sched_ops *sched; size_t offs = 0; rcu_read_lock(); list_for_each_entry_rcu(sched, &mptcp_sched_list, list) { offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", sched->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } int mptcp_register_scheduler(struct mptcp_sched_ops *sched) { if (!sched->get_subflow) return -EINVAL; spin_lock(&mptcp_sched_list_lock); if (mptcp_sched_find(sched->name)) { spin_unlock(&mptcp_sched_list_lock); return -EEXIST; } list_add_tail_rcu(&sched->list, &mptcp_sched_list); spin_unlock(&mptcp_sched_list_lock); pr_debug("%s registered\n", sched->name); return 0; } void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched) { if (sched == &mptcp_sched_default) return; spin_lock(&mptcp_sched_list_lock); list_del_rcu(&sched->list); spin_unlock(&mptcp_sched_list_lock); } void mptcp_sched_init(void) { mptcp_register_scheduler(&mptcp_sched_default); } int mptcp_init_sched(struct mptcp_sock *msk, struct mptcp_sched_ops *sched) { if (!sched) sched = &mptcp_sched_default; if (!bpf_try_module_get(sched, sched->owner)) return -EBUSY; msk->sched = sched; if (msk->sched->init) msk->sched->init(msk); pr_debug("sched=%s\n", msk->sched->name); return 0; } void mptcp_release_sched(struct mptcp_sock *msk) { struct mptcp_sched_ops *sched = msk->sched; if (!sched) return; msk->sched = NULL; if (sched->release) sched->release(msk); bpf_module_put(sched, sched->owner); } void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, bool scheduled) { WRITE_ONCE(subflow->scheduled, scheduled); } int mptcp_sched_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; struct mptcp_sched_data data; msk_owned_by_me(msk); /* the following check is moved out of mptcp_subflow_get_send */ if (__mptcp_check_fallback(msk)) { if (msk->first && __tcp_can_send(msk->first) && sk_stream_memory_free(msk->first)) { mptcp_subflow_set_scheduled(mptcp_subflow_ctx(msk->first), true); return 0; } return -EINVAL; } mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) return 0; } data.reinject = false; if (msk->sched == &mptcp_sched_default || !msk->sched) return mptcp_sched_default_get_subflow(msk, &data); return msk->sched->get_subflow(msk, &data); } int mptcp_sched_get_retrans(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; struct mptcp_sched_data data; msk_owned_by_me(msk); /* the following check is moved out of mptcp_subflow_get_retrans */ if (__mptcp_check_fallback(msk)) return -EINVAL; mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) return 0; } data.reinject = true; if (msk->sched == &mptcp_sched_default || !msk->sched) return mptcp_sched_default_get_subflow(msk, &data); return msk->sched->get_subflow(msk, &data); }
558 558 45 45 424 134 7 7 4 3 7 4 4 4 4 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 // SPDX-License-Identifier: GPL-2.0 /* * class.c - basic device class management * * Copyright (c) 2002-3 Patrick Mochel * Copyright (c) 2002-3 Open Source Development Labs * Copyright (c) 2003-2004 Greg Kroah-Hartman * Copyright (c) 2003-2004 IBM Corp. */ #include <linux/device/class.h> #include <linux/device.h> #include <linux/module.h> #include <linux/init.h> #include <linux/string.h> #include <linux/kdev_t.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/mutex.h> #include "base.h" /* /sys/class */ static struct kset *class_kset; #define to_class_attr(_attr) container_of(_attr, struct class_attribute, attr) /** * class_to_subsys - Turn a struct class into a struct subsys_private * * @class: pointer to the struct bus_type to look up * * The driver core internals need to work on the subsys_private structure, not * the external struct class pointer. This function walks the list of * registered classes in the system and finds the matching one and returns the * internal struct subsys_private that relates to that class. * * Note, the reference count of the return value is INCREMENTED if it is not * NULL. A call to subsys_put() must be done when finished with the pointer in * order for it to be properly freed. */ struct subsys_private *class_to_subsys(const struct class *class) { struct subsys_private *sp = NULL; struct kobject *kobj; if (!class || !class_kset) return NULL; spin_lock(&class_kset->list_lock); if (list_empty(&class_kset->list)) goto done; list_for_each_entry(kobj, &class_kset->list, entry) { struct kset *kset = container_of(kobj, struct kset, kobj); sp = container_of_const(kset, struct subsys_private, subsys); if (sp->class == class) goto done; } sp = NULL; done: sp = subsys_get(sp); spin_unlock(&class_kset->list_lock); return sp; } static ssize_t class_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct class_attribute *class_attr = to_class_attr(attr); struct subsys_private *cp = to_subsys_private(kobj); ssize_t ret = -EIO; if (class_attr->show) ret = class_attr->show(cp->class, class_attr, buf); return ret; } static ssize_t class_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct class_attribute *class_attr = to_class_attr(attr); struct subsys_private *cp = to_subsys_private(kobj); ssize_t ret = -EIO; if (class_attr->store) ret = class_attr->store(cp->class, class_attr, buf, count); return ret; } static void class_release(struct kobject *kobj) { struct subsys_private *cp = to_subsys_private(kobj); const struct class *class = cp->class; pr_debug("class '%s': release.\n", class->name); if (class->class_release) class->class_release(class); else pr_debug("class '%s' does not have a release() function, " "be careful\n", class->name); lockdep_unregister_key(&cp->lock_key); kfree(cp); } static const struct kobj_ns_type_operations *class_child_ns_type(const struct kobject *kobj) { const struct subsys_private *cp = to_subsys_private(kobj); const struct class *class = cp->class; return class->ns_type; } static const struct sysfs_ops class_sysfs_ops = { .show = class_attr_show, .store = class_attr_store, }; static const struct kobj_type class_ktype = { .sysfs_ops = &class_sysfs_ops, .release = class_release, .child_ns_type = class_child_ns_type, }; int class_create_file_ns(const struct class *cls, const struct class_attribute *attr, const void *ns) { struct subsys_private *sp = class_to_subsys(cls); int error; if (!sp) return -EINVAL; error = sysfs_create_file_ns(&sp->subsys.kobj, &attr->attr, ns); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(class_create_file_ns); void class_remove_file_ns(const struct class *cls, const struct class_attribute *attr, const void *ns) { struct subsys_private *sp = class_to_subsys(cls); if (!sp) return; sysfs_remove_file_ns(&sp->subsys.kobj, &attr->attr, ns); subsys_put(sp); } EXPORT_SYMBOL_GPL(class_remove_file_ns); static struct device *klist_class_to_dev(struct klist_node *n) { struct device_private *p = to_device_private_class(n); return p->device; } static void klist_class_dev_get(struct klist_node *n) { struct device *dev = klist_class_to_dev(n); get_device(dev); } static void klist_class_dev_put(struct klist_node *n) { struct device *dev = klist_class_to_dev(n); put_device(dev); } int class_register(const struct class *cls) { struct subsys_private *cp; struct lock_class_key *key; int error; pr_debug("device class '%s': registering\n", cls->name); if (cls->ns_type && !cls->namespace) { pr_err("%s: class '%s' does not have namespace\n", __func__, cls->name); return -EINVAL; } if (!cls->ns_type && cls->namespace) { pr_err("%s: class '%s' does not have ns_type\n", __func__, cls->name); return -EINVAL; } cp = kzalloc(sizeof(*cp), GFP_KERNEL); if (!cp) return -ENOMEM; klist_init(&cp->klist_devices, klist_class_dev_get, klist_class_dev_put); INIT_LIST_HEAD(&cp->interfaces); kset_init(&cp->glue_dirs); key = &cp->lock_key; lockdep_register_key(key); __mutex_init(&cp->mutex, "subsys mutex", key); error = kobject_set_name(&cp->subsys.kobj, "%s", cls->name); if (error) goto err_out; cp->subsys.kobj.kset = class_kset; cp->subsys.kobj.ktype = &class_ktype; cp->class = cls; error = kset_register(&cp->subsys); if (error) goto err_out; error = sysfs_create_groups(&cp->subsys.kobj, cls->class_groups); if (error) { kobject_del(&cp->subsys.kobj); kfree_const(cp->subsys.kobj.name); goto err_out; } return 0; err_out: lockdep_unregister_key(key); kfree(cp); return error; } EXPORT_SYMBOL_GPL(class_register); void class_unregister(const struct class *cls) { struct subsys_private *sp = class_to_subsys(cls); if (!sp) return; pr_debug("device class '%s': unregistering\n", cls->name); sysfs_remove_groups(&sp->subsys.kobj, cls->class_groups); kset_unregister(&sp->subsys); subsys_put(sp); } EXPORT_SYMBOL_GPL(class_unregister); static void class_create_release(const struct class *cls) { pr_debug("%s called for %s\n", __func__, cls->name); kfree(cls); } /** * class_create - create a struct class structure * @name: pointer to a string for the name of this class. * * This is used to create a struct class pointer that can then be used * in calls to device_create(). * * Returns &struct class pointer on success, or ERR_PTR() on error. * * Note, the pointer created here is to be destroyed when finished by * making a call to class_destroy(). */ struct class *class_create(const char *name) { struct class *cls; int retval; cls = kzalloc(sizeof(*cls), GFP_KERNEL); if (!cls) { retval = -ENOMEM; goto error; } cls->name = name; cls->class_release = class_create_release; retval = class_register(cls); if (retval) goto error; return cls; error: kfree(cls); return ERR_PTR(retval); } EXPORT_SYMBOL_GPL(class_create); /** * class_destroy - destroys a struct class structure * @cls: pointer to the struct class that is to be destroyed * * Note, the pointer to be destroyed must have been created with a call * to class_create(). */ void class_destroy(const struct class *cls) { if (IS_ERR_OR_NULL(cls)) return; class_unregister(cls); } EXPORT_SYMBOL_GPL(class_destroy); /** * class_dev_iter_init - initialize class device iterator * @iter: class iterator to initialize * @class: the class we wanna iterate over * @start: the device to start iterating from, if any * @type: device_type of the devices to iterate over, NULL for all * * Initialize class iterator @iter such that it iterates over devices * of @class. If @start is set, the list iteration will start there, * otherwise if it is NULL, the iteration starts at the beginning of * the list. */ void class_dev_iter_init(struct class_dev_iter *iter, const struct class *class, const struct device *start, const struct device_type *type) { struct subsys_private *sp = class_to_subsys(class); struct klist_node *start_knode = NULL; if (!sp) return; if (start) start_knode = &start->p->knode_class; klist_iter_init_node(&sp->klist_devices, &iter->ki, start_knode); iter->type = type; iter->sp = sp; } EXPORT_SYMBOL_GPL(class_dev_iter_init); /** * class_dev_iter_next - iterate to the next device * @iter: class iterator to proceed * * Proceed @iter to the next device and return it. Returns NULL if * iteration is complete. * * The returned device is referenced and won't be released till * iterator is proceed to the next device or exited. The caller is * free to do whatever it wants to do with the device including * calling back into class code. */ struct device *class_dev_iter_next(struct class_dev_iter *iter) { struct klist_node *knode; struct device *dev; while (1) { knode = klist_next(&iter->ki); if (!knode) return NULL; dev = klist_class_to_dev(knode); if (!iter->type || iter->type == dev->type) return dev; } } EXPORT_SYMBOL_GPL(class_dev_iter_next); /** * class_dev_iter_exit - finish iteration * @iter: class iterator to finish * * Finish an iteration. Always call this function after iteration is * complete whether the iteration ran till the end or not. */ void class_dev_iter_exit(struct class_dev_iter *iter) { klist_iter_exit(&iter->ki); subsys_put(iter->sp); } EXPORT_SYMBOL_GPL(class_dev_iter_exit); /** * class_for_each_device - device iterator * @class: the class we're iterating * @start: the device to start with in the list, if any. * @data: data for the callback * @fn: function to be called for each device * * Iterate over @class's list of devices, and call @fn for each, * passing it @data. If @start is set, the list iteration will start * there, otherwise if it is NULL, the iteration starts at the * beginning of the list. * * We check the return of @fn each time. If it returns anything * other than 0, we break out and return that value. * * @fn is allowed to do anything including calling back into class * code. There's no locking restriction. */ int class_for_each_device(const struct class *class, const struct device *start, void *data, int (*fn)(struct device *, void *)) { struct subsys_private *sp = class_to_subsys(class); struct class_dev_iter iter; struct device *dev; int error = 0; if (!class) return -EINVAL; if (!sp) { WARN(1, "%s called for class '%s' before it was registered", __func__, class->name); return -EINVAL; } class_dev_iter_init(&iter, class, start, NULL); while ((dev = class_dev_iter_next(&iter))) { error = fn(dev, data); if (error) break; } class_dev_iter_exit(&iter); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(class_for_each_device); /** * class_find_device - device iterator for locating a particular device * @class: the class we're iterating * @start: Device to begin with * @data: data for the match function * @match: function to check device * * This is similar to the class_for_each_dev() function above, but it * returns a reference to a device that is 'found' for later use, as * determined by the @match callback. * * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. * * Note, you will need to drop the reference with put_device() after use. * * @match is allowed to do anything including calling back into class * code. There's no locking restriction. */ struct device *class_find_device(const struct class *class, const struct device *start, const void *data, device_match_t match) { struct subsys_private *sp = class_to_subsys(class); struct class_dev_iter iter; struct device *dev; if (!class) return NULL; if (!sp) { WARN(1, "%s called for class '%s' before it was registered", __func__, class->name); return NULL; } class_dev_iter_init(&iter, class, start, NULL); while ((dev = class_dev_iter_next(&iter))) { if (match(dev, data)) { get_device(dev); break; } } class_dev_iter_exit(&iter); subsys_put(sp); return dev; } EXPORT_SYMBOL_GPL(class_find_device); int class_interface_register(struct class_interface *class_intf) { struct subsys_private *sp; const struct class *parent; struct class_dev_iter iter; struct device *dev; if (!class_intf || !class_intf->class) return -ENODEV; parent = class_intf->class; sp = class_to_subsys(parent); if (!sp) return -EINVAL; /* * Reference in sp is now incremented and will be dropped when * the interface is removed in the call to class_interface_unregister() */ mutex_lock(&sp->mutex); list_add_tail(&class_intf->node, &sp->interfaces); if (class_intf->add_dev) { class_dev_iter_init(&iter, parent, NULL, NULL); while ((dev = class_dev_iter_next(&iter))) class_intf->add_dev(dev); class_dev_iter_exit(&iter); } mutex_unlock(&sp->mutex); return 0; } EXPORT_SYMBOL_GPL(class_interface_register); void class_interface_unregister(struct class_interface *class_intf) { struct subsys_private *sp; const struct class *parent = class_intf->class; struct class_dev_iter iter; struct device *dev; if (!parent) return; sp = class_to_subsys(parent); if (!sp) return; mutex_lock(&sp->mutex); list_del_init(&class_intf->node); if (class_intf->remove_dev) { class_dev_iter_init(&iter, parent, NULL, NULL); while ((dev = class_dev_iter_next(&iter))) class_intf->remove_dev(dev); class_dev_iter_exit(&iter); } mutex_unlock(&sp->mutex); /* * Decrement the reference count twice, once for the class_to_subsys() * call in the start of this function, and the second one from the * reference increment in class_interface_register() */ subsys_put(sp); subsys_put(sp); } EXPORT_SYMBOL_GPL(class_interface_unregister); ssize_t show_class_attr_string(const struct class *class, const struct class_attribute *attr, char *buf) { struct class_attribute_string *cs; cs = container_of(attr, struct class_attribute_string, attr); return sysfs_emit(buf, "%s\n", cs->str); } EXPORT_SYMBOL_GPL(show_class_attr_string); struct class_compat { struct kobject *kobj; }; /** * class_compat_register - register a compatibility class * @name: the name of the class * * Compatibility class are meant as a temporary user-space compatibility * workaround when converting a family of class devices to a bus devices. */ struct class_compat *class_compat_register(const char *name) { struct class_compat *cls; cls = kmalloc(sizeof(struct class_compat), GFP_KERNEL); if (!cls) return NULL; cls->kobj = kobject_create_and_add(name, &class_kset->kobj); if (!cls->kobj) { kfree(cls); return NULL; } return cls; } EXPORT_SYMBOL_GPL(class_compat_register); /** * class_compat_unregister - unregister a compatibility class * @cls: the class to unregister */ void class_compat_unregister(struct class_compat *cls) { kobject_put(cls->kobj); kfree(cls); } EXPORT_SYMBOL_GPL(class_compat_unregister); /** * class_compat_create_link - create a compatibility class device link to * a bus device * @cls: the compatibility class * @dev: the target bus device * @device_link: an optional device to which a "device" link should be created */ int class_compat_create_link(struct class_compat *cls, struct device *dev, struct device *device_link) { int error; error = sysfs_create_link(cls->kobj, &dev->kobj, dev_name(dev)); if (error) return error; /* * Optionally add a "device" link (typically to the parent), as a * class device would have one and we want to provide as much * backwards compatibility as possible. */ if (device_link) { error = sysfs_create_link(&dev->kobj, &device_link->kobj, "device"); if (error) sysfs_remove_link(cls->kobj, dev_name(dev)); } return error; } EXPORT_SYMBOL_GPL(class_compat_create_link); /** * class_compat_remove_link - remove a compatibility class device link to * a bus device * @cls: the compatibility class * @dev: the target bus device * @device_link: an optional device to which a "device" link was previously * created */ void class_compat_remove_link(struct class_compat *cls, struct device *dev, struct device *device_link) { if (device_link) sysfs_remove_link(&dev->kobj, "device"); sysfs_remove_link(cls->kobj, dev_name(dev)); } EXPORT_SYMBOL_GPL(class_compat_remove_link); /** * class_is_registered - determine if at this moment in time, a class is * registered in the driver core or not. * @class: the class to check * * Returns a boolean to state if the class is registered in the driver core * or not. Note that the value could switch right after this call is made, * so only use this in places where you "know" it is safe to do so (usually * to determine if the specific class has been registered yet or not). * * Be careful in using this. */ bool class_is_registered(const struct class *class) { struct subsys_private *sp = class_to_subsys(class); bool is_initialized = false; if (sp) { is_initialized = true; subsys_put(sp); } return is_initialized; } EXPORT_SYMBOL_GPL(class_is_registered); int __init classes_init(void) { class_kset = kset_create_and_add("class", NULL, NULL); if (!class_kset) return -ENOMEM; return 0; }
122 59 2 2 33 23 33 14 1 6 6 12 2 10 12 2 2 41 41 41 44 44 1 2 44 14 14 1 1 2 2 2 12 12 14 3 3 6 6 6 3 3 6 19 19 18 13 7 11 56 56 8 62 62 30 3 33 35 21 44 44 44 33 13 13 44 19 3 3 13 21 33 49 53 31 49 41 12 12 41 41 12 12 12 41 41 187 187 187 183 177 8 4 178 4 4 178 4 4 178 178 180 171 180 661 662 661 296 579 663 579 13 1 1 7 10 2 6 3 202 170 171 41 41 82 82 27 200 203 202 1 203 203 203 203 200 202 202 203 203 1 203 203 203 203 203 203 203 203 193 42 203 3 63 173 2 115 5 1 3 1 10 10 10 10 10 10 3 10 10 41 41 173 59 117 1 49 49 1 219 201 202 55 4 16 16 17 16 17 1 16 300 278 1 21 229 2 3 53 12 3 62 55 236 288 270 17 1 16 109 6 264 50 220 2 266 269 15 7 1 11 23 35 225 257 259 258 260 260 257 244 14 260 1 41 2 201 243 228 14 3 3 45 19 300 104 301 297 47 45 3 47 47 1 41 41 174 300 328 1 327 327 1 1 1 169 155 300 1 298 193 164 32 2 26 20 2 5 2 12 59 118 167 7 64 104 168 7 3 1 3 3 1 3 4 36 36 4 136 107 136 32 1 1 1 5 2 15 4 13 5 8 1 4 3 1 45 20 1 2 3 2 3 2 1 2 10 418 208 13 220 291 2 25 1 1 23 176 3 1 17 282 2 3 2 2 4 5 1 1 3 2 1 6 1 2 1 2 45 5 7 2 13 4 2 1 1 1 1 2 1 1 1 2 2 2 5 1 6 2 1 136 1 2 4 4 1 4 1 6 286 418 10 5 6 91 91 53 53 19 5 2 1 1 5 1 7 1 3 20 17 20 20 1007 722 288 20 82 81 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 // SPDX-License-Identifier: GPL-2.0-or-later /* * TUN - Universal TUN/TAP device driver. * Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com> * * $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $ */ /* * Changes: * * Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14 * Add TUNSETLINK ioctl to set the link encapsulation * * Mark Smith <markzzzsmith@yahoo.com.au> * Use eth_random_addr() for tap MAC address. * * Harald Roelle <harald.roelle@ifi.lmu.de> 2004/04/20 * Fixes in packet dropping, queue length setting and queue wakeup. * Increased default tx queue length. * Added ethtool API. * Minor cleanups * * Daniel Podlejski <underley@underley.eu.org> * Modifications for 2.3.99-pre5 kernel. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define DRV_NAME "tun" #define DRV_VERSION "1.6" #define DRV_DESCRIPTION "Universal TUN/TAP device driver" #define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>" #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/major.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/fcntl.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/miscdevice.h> #include <linux/ethtool.h> #include <linux/rtnetlink.h> #include <linux/compat.h> #include <linux/if.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_tun.h> #include <linux/if_vlan.h> #include <linux/crc32.h> #include <linux/math.h> #include <linux/nsproxy.h> #include <linux/virtio_net.h> #include <linux/rcupdate.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <net/xdp.h> #include <net/ip_tunnels.h> #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/skb_array.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/mutex.h> #include <linux/ieee802154.h> #include <uapi/linux/if_ltalk.h> #include <uapi/linux/if_fddi.h> #include <uapi/linux/if_hippi.h> #include <uapi/linux/if_fc.h> #include <net/ax25.h> #include <net/rose.h> #include <net/6lowpan.h> #include <net/rps.h> #include <linux/uaccess.h> #include <linux/proc_fs.h> static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd); #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) /* TUN device flags */ /* IFF_ATTACH_QUEUE is never stored in device flags, * overload it to mean fasync when stored there. */ #define TUN_FASYNC IFF_ATTACH_QUEUE /* High bits in flags field are unused. */ #define TUN_VNET_LE 0x80000000 #define TUN_VNET_BE 0x40000000 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS) #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 struct tap_filter { unsigned int count; /* Number of addrs. Zero means disabled */ u32 mask[2]; /* Mask of the hashed addrs */ unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN]; }; /* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal * to max number of VCPUs in guest. */ #define MAX_TAP_QUEUES 256 #define MAX_TAP_FLOWS 4096 #define TUN_FLOW_EXPIRE (3 * HZ) /* A tun_file connects an open character device to a tuntap netdevice. It * also contains all socket related structures (except sock_fprog and tap_filter) * to serve as one transmit queue for tuntap device. The sock_fprog and * tap_filter were kept in tun_struct since they were used for filtering for the * netdevice not for a specific queue (at least I didn't see the requirement for * this). * * RCU usage: * The tun_file and tun_struct are loosely coupled, the pointer from one to the * other can only be read while rcu_read_lock or rtnl_lock is held. */ struct tun_file { struct sock sk; struct socket socket; struct tun_struct __rcu *tun; struct fasync_struct *fasync; /* only used for fasnyc */ unsigned int flags; union { u16 queue_index; unsigned int ifindex; }; struct napi_struct napi; bool napi_enabled; bool napi_frags_enabled; struct mutex napi_mutex; /* Protects access to the above napi */ struct list_head next; struct tun_struct *detached; struct ptr_ring tx_ring; struct xdp_rxq_info xdp_rxq; }; struct tun_page { struct page *page; int count; }; struct tun_flow_entry { struct hlist_node hash_link; struct rcu_head rcu; struct tun_struct *tun; u32 rxhash; u32 rps_rxhash; int queue_index; unsigned long updated ____cacheline_aligned_in_smp; }; #define TUN_NUM_FLOW_ENTRIES 1024 #define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1) struct tun_prog { struct rcu_head rcu; struct bpf_prog *prog; }; /* Since the socket were moved to tun_file, to preserve the behavior of persist * device, socket filter, sndbuf and vnet header size were restore when the * file were attached to a persist device. */ struct tun_struct { struct tun_file __rcu *tfiles[MAX_TAP_QUEUES]; unsigned int numqueues; unsigned int flags; kuid_t owner; kgid_t group; struct net_device *dev; netdev_features_t set_features; #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \ NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4) int align; int vnet_hdr_sz; int sndbuf; struct tap_filter txflt; struct sock_fprog fprog; /* protected by rtnl lock */ bool filter_attached; u32 msg_enable; spinlock_t lock; struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; struct timer_list flow_gc_timer; unsigned long ageing_time; unsigned int numdisabled; struct list_head disabled; void *security; u32 flow_count; u32 rx_batched; atomic_long_t rx_frame_errors; struct bpf_prog __rcu *xdp_prog; struct tun_prog __rcu *steering_prog; struct tun_prog __rcu *filter_prog; struct ethtool_link_ksettings link_ksettings; /* init args */ struct file *file; struct ifreq *ifr; }; struct veth { __be16 h_vlan_proto; __be16 h_vlan_TCI; }; static void tun_flow_init(struct tun_struct *tun); static void tun_flow_uninit(struct tun_struct *tun); static int tun_napi_receive(struct napi_struct *napi, int budget) { struct tun_file *tfile = container_of(napi, struct tun_file, napi); struct sk_buff_head *queue = &tfile->sk.sk_write_queue; struct sk_buff_head process_queue; struct sk_buff *skb; int received = 0; __skb_queue_head_init(&process_queue); spin_lock(&queue->lock); skb_queue_splice_tail_init(queue, &process_queue); spin_unlock(&queue->lock); while (received < budget && (skb = __skb_dequeue(&process_queue))) { napi_gro_receive(napi, skb); ++received; } if (!skb_queue_empty(&process_queue)) { spin_lock(&queue->lock); skb_queue_splice(&process_queue, queue); spin_unlock(&queue->lock); } return received; } static int tun_napi_poll(struct napi_struct *napi, int budget) { unsigned int received; received = tun_napi_receive(napi, budget); if (received < budget) napi_complete_done(napi, received); return received; } static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, bool napi_en, bool napi_frags) { tfile->napi_enabled = napi_en; tfile->napi_frags_enabled = napi_en && napi_frags; if (napi_en) { netif_napi_add_tx(tun->dev, &tfile->napi, tun_napi_poll); napi_enable(&tfile->napi); } } static void tun_napi_enable(struct tun_file *tfile) { if (tfile->napi_enabled) napi_enable(&tfile->napi); } static void tun_napi_disable(struct tun_file *tfile) { if (tfile->napi_enabled) napi_disable(&tfile->napi); } static void tun_napi_del(struct tun_file *tfile) { if (tfile->napi_enabled) netif_napi_del(&tfile->napi); } static bool tun_napi_frags_enabled(const struct tun_file *tfile) { return tfile->napi_frags_enabled; } #ifdef CONFIG_TUN_VNET_CROSS_LE static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) { return tun->flags & TUN_VNET_BE ? false : virtio_legacy_is_little_endian(); } static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) { int be = !!(tun->flags & TUN_VNET_BE); if (put_user(be, argp)) return -EFAULT; return 0; } static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) { int be; if (get_user(be, argp)) return -EFAULT; if (be) tun->flags |= TUN_VNET_BE; else tun->flags &= ~TUN_VNET_BE; return 0; } #else static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) { return virtio_legacy_is_little_endian(); } static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) { return -EINVAL; } static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) { return -EINVAL; } #endif /* CONFIG_TUN_VNET_CROSS_LE */ static inline bool tun_is_little_endian(struct tun_struct *tun) { return tun->flags & TUN_VNET_LE || tun_legacy_is_little_endian(tun); } static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val) { return __virtio16_to_cpu(tun_is_little_endian(tun), val); } static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val) { return __cpu_to_virtio16(tun_is_little_endian(tun), val); } static inline u32 tun_hashfn(u32 rxhash) { return rxhash & TUN_MASK_FLOW_ENTRIES; } static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash) { struct tun_flow_entry *e; hlist_for_each_entry_rcu(e, head, hash_link) { if (e->rxhash == rxhash) return e; } return NULL; } static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, struct hlist_head *head, u32 rxhash, u16 queue_index) { struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC); if (e) { netif_info(tun, tx_queued, tun->dev, "create flow: hash %u index %u\n", rxhash, queue_index); e->updated = jiffies; e->rxhash = rxhash; e->rps_rxhash = 0; e->queue_index = queue_index; e->tun = tun; hlist_add_head_rcu(&e->hash_link, head); ++tun->flow_count; } return e; } static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) { netif_info(tun, tx_queued, tun->dev, "delete flow: hash %u index %u\n", e->rxhash, e->queue_index); hlist_del_rcu(&e->hash_link); kfree_rcu(e, rcu); --tun->flow_count; } static void tun_flow_flush(struct tun_struct *tun) { int i; spin_lock_bh(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) tun_flow_delete(tun, e); } spin_unlock_bh(&tun->lock); } static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index) { int i; spin_lock_bh(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { if (e->queue_index == queue_index) tun_flow_delete(tun, e); } } spin_unlock_bh(&tun->lock); } static void tun_flow_cleanup(struct timer_list *t) { struct tun_struct *tun = from_timer(tun, t, flow_gc_timer); unsigned long delay = tun->ageing_time; unsigned long next_timer = jiffies + delay; unsigned long count = 0; int i; spin_lock(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { unsigned long this_timer; this_timer = e->updated + delay; if (time_before_eq(this_timer, jiffies)) { tun_flow_delete(tun, e); continue; } count++; if (time_before(this_timer, next_timer)) next_timer = this_timer; } } if (count) mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer)); spin_unlock(&tun->lock); } static void tun_flow_update(struct tun_struct *tun, u32 rxhash, struct tun_file *tfile) { struct hlist_head *head; struct tun_flow_entry *e; unsigned long delay = tun->ageing_time; u16 queue_index = tfile->queue_index; head = &tun->flows[tun_hashfn(rxhash)]; rcu_read_lock(); e = tun_flow_find(head, rxhash); if (likely(e)) { /* TODO: keep queueing to old queue until it's empty? */ if (READ_ONCE(e->queue_index) != queue_index) WRITE_ONCE(e->queue_index, queue_index); if (e->updated != jiffies) e->updated = jiffies; sock_rps_record_flow_hash(e->rps_rxhash); } else { spin_lock_bh(&tun->lock); if (!tun_flow_find(head, rxhash) && tun->flow_count < MAX_TAP_FLOWS) tun_flow_create(tun, head, rxhash, queue_index); if (!timer_pending(&tun->flow_gc_timer)) mod_timer(&tun->flow_gc_timer, round_jiffies_up(jiffies + delay)); spin_unlock_bh(&tun->lock); } rcu_read_unlock(); } /* Save the hash received in the stack receive path and update the * flow_hash table accordingly. */ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) { if (unlikely(e->rps_rxhash != hash)) e->rps_rxhash = hash; } /* We try to identify a flow through its rxhash. The reason that * we do not check rxq no. is because some cards(e.g 82599), chooses * the rxq based on the txq where the last packet of the flow comes. As * the userspace application move between processors, we may get a * different rxq no. here. */ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) { struct tun_flow_entry *e; u32 txq, numqueues; numqueues = READ_ONCE(tun->numqueues); txq = __skb_get_hash_symmetric(skb); e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); if (e) { tun_flow_save_rps_rxhash(e, txq); txq = e->queue_index; } else { txq = reciprocal_scale(txq, numqueues); } return txq; } static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) { struct tun_prog *prog; u32 numqueues; u16 ret = 0; numqueues = READ_ONCE(tun->numqueues); if (!numqueues) return 0; prog = rcu_dereference(tun->steering_prog); if (prog) ret = bpf_prog_run_clear_cb(prog->prog, skb); return ret % numqueues; } static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { struct tun_struct *tun = netdev_priv(dev); u16 ret; rcu_read_lock(); if (rcu_dereference(tun->steering_prog)) ret = tun_ebpf_select_queue(tun, skb); else ret = tun_automq_select_queue(tun, skb); rcu_read_unlock(); return ret; } static inline bool tun_not_capable(struct tun_struct *tun) { const struct cred *cred = current_cred(); struct net *net = dev_net(tun->dev); return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) || (gid_valid(tun->group) && !in_egroup_p(tun->group))) && !ns_capable(net->user_ns, CAP_NET_ADMIN); } static void tun_set_real_num_queues(struct tun_struct *tun) { netif_set_real_num_tx_queues(tun->dev, tun->numqueues); netif_set_real_num_rx_queues(tun->dev, tun->numqueues); } static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile) { tfile->detached = tun; list_add_tail(&tfile->next, &tun->disabled); ++tun->numdisabled; } static struct tun_struct *tun_enable_queue(struct tun_file *tfile) { struct tun_struct *tun = tfile->detached; tfile->detached = NULL; list_del_init(&tfile->next); --tun->numdisabled; return tun; } void tun_ptr_free(void *ptr) { if (!ptr) return; if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); xdp_return_frame(xdpf); } else { __skb_array_destroy_skb(ptr); } } EXPORT_SYMBOL_GPL(tun_ptr_free); static void tun_queue_purge(struct tun_file *tfile) { void *ptr; while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL) tun_ptr_free(ptr); skb_queue_purge(&tfile->sk.sk_write_queue); skb_queue_purge(&tfile->sk.sk_error_queue); } static void __tun_detach(struct tun_file *tfile, bool clean) { struct tun_file *ntfile; struct tun_struct *tun; tun = rtnl_dereference(tfile->tun); if (tun && clean) { if (!tfile->detached) tun_napi_disable(tfile); tun_napi_del(tfile); } if (tun && !tfile->detached) { u16 index = tfile->queue_index; BUG_ON(index >= tun->numqueues); rcu_assign_pointer(tun->tfiles[index], tun->tfiles[tun->numqueues - 1]); ntfile = rtnl_dereference(tun->tfiles[index]); ntfile->queue_index = index; ntfile->xdp_rxq.queue_index = index; rcu_assign_pointer(tun->tfiles[tun->numqueues - 1], NULL); --tun->numqueues; if (clean) { RCU_INIT_POINTER(tfile->tun, NULL); sock_put(&tfile->sk); } else { tun_disable_queue(tun, tfile); tun_napi_disable(tfile); } synchronize_net(); tun_flow_delete_by_queue(tun, tun->numqueues + 1); /* Drop read queue */ tun_queue_purge(tfile); tun_set_real_num_queues(tun); } else if (tfile->detached && clean) { tun = tun_enable_queue(tfile); sock_put(&tfile->sk); } if (clean) { if (tun && tun->numqueues == 0 && tun->numdisabled == 0) { netif_carrier_off(tun->dev); if (!(tun->flags & IFF_PERSIST) && tun->dev->reg_state == NETREG_REGISTERED) unregister_netdevice(tun->dev); } if (tun) xdp_rxq_info_unreg(&tfile->xdp_rxq); ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free); } } static void tun_detach(struct tun_file *tfile, bool clean) { struct tun_struct *tun; struct net_device *dev; rtnl_lock(); tun = rtnl_dereference(tfile->tun); dev = tun ? tun->dev : NULL; __tun_detach(tfile, clean); if (dev) netdev_state_change(dev); rtnl_unlock(); if (clean) sock_put(&tfile->sk); } static void tun_detach_all(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile, *tmp; int i, n = tun->numqueues; for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); BUG_ON(!tfile); tun_napi_disable(tfile); tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); --tun->numqueues; } list_for_each_entry(tfile, &tun->disabled, next) { tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); } BUG_ON(tun->numqueues != 0); synchronize_net(); for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); tun_napi_del(tfile); /* Drop read queue */ tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { tun_napi_del(tfile); tun_enable_queue(tfile); tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); } BUG_ON(tun->numdisabled != 0); if (tun->flags & IFF_PERSIST) module_put(THIS_MODULE); } static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter, bool napi, bool napi_frags, bool publish_tun) { struct tun_file *tfile = file->private_data; struct net_device *dev = tun->dev; int err; err = security_tun_dev_attach(tfile->socket.sk, tun->security); if (err < 0) goto out; err = -EINVAL; if (rtnl_dereference(tfile->tun) && !tfile->detached) goto out; err = -EBUSY; if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1) goto out; err = -E2BIG; if (!tfile->detached && tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES) goto out; err = 0; /* Re-attach the filter to persist device */ if (!skip_filter && (tun->filter_attached == true)) { lock_sock(tfile->socket.sk); err = sk_attach_filter(&tun->fprog, tfile->socket.sk); release_sock(tfile->socket.sk); if (!err) goto out; } if (!tfile->detached && ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len, GFP_KERNEL, tun_ptr_free)) { err = -ENOMEM; goto out; } tfile->queue_index = tun->numqueues; tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN; if (tfile->detached) { /* Re-attach detached tfile, updating XDP queue_index */ WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq)); if (tfile->xdp_rxq.queue_index != tfile->queue_index) tfile->xdp_rxq.queue_index = tfile->queue_index; } else { /* Setup XDP RX-queue info, for new tfile getting attached */ err = xdp_rxq_info_reg(&tfile->xdp_rxq, tun->dev, tfile->queue_index, 0); if (err < 0) goto out; err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); if (err < 0) { xdp_rxq_info_unreg(&tfile->xdp_rxq); goto out; } err = 0; } if (tfile->detached) { tun_enable_queue(tfile); tun_napi_enable(tfile); } else { sock_hold(&tfile->sk); tun_napi_init(tun, tfile, napi, napi_frags); } if (rtnl_dereference(tun->xdp_prog)) sock_set_flag(&tfile->sk, SOCK_XDP); /* device is allowed to go away first, so no need to hold extra * refcnt. */ /* Publish tfile->tun and tun->tfiles only after we've fully * initialized tfile; otherwise we risk using half-initialized * object. */ if (publish_tun) rcu_assign_pointer(tfile->tun, tun); rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); tun->numqueues++; tun_set_real_num_queues(tun); out: return err; } static struct tun_struct *tun_get(struct tun_file *tfile) { struct tun_struct *tun; rcu_read_lock(); tun = rcu_dereference(tfile->tun); if (tun) dev_hold(tun->dev); rcu_read_unlock(); return tun; } static void tun_put(struct tun_struct *tun) { dev_put(tun->dev); } /* TAP filtering */ static void addr_hash_set(u32 *mask, const u8 *addr) { int n = ether_crc(ETH_ALEN, addr) >> 26; mask[n >> 5] |= (1 << (n & 31)); } static unsigned int addr_hash_test(const u32 *mask, const u8 *addr) { int n = ether_crc(ETH_ALEN, addr) >> 26; return mask[n >> 5] & (1 << (n & 31)); } static int update_filter(struct tap_filter *filter, void __user *arg) { struct { u8 u[ETH_ALEN]; } *addr; struct tun_filter uf; int err, alen, n, nexact; if (copy_from_user(&uf, arg, sizeof(uf))) return -EFAULT; if (!uf.count) { /* Disabled */ filter->count = 0; return 0; } alen = ETH_ALEN * uf.count; addr = memdup_user(arg + sizeof(uf), alen); if (IS_ERR(addr)) return PTR_ERR(addr); /* The filter is updated without holding any locks. Which is * perfectly safe. We disable it first and in the worst * case we'll accept a few undesired packets. */ filter->count = 0; wmb(); /* Use first set of addresses as an exact filter */ for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++) memcpy(filter->addr[n], addr[n].u, ETH_ALEN); nexact = n; /* Remaining multicast addresses are hashed, * unicast will leave the filter disabled. */ memset(filter->mask, 0, sizeof(filter->mask)); for (; n < uf.count; n++) { if (!is_multicast_ether_addr(addr[n].u)) { err = 0; /* no filter */ goto free_addr; } addr_hash_set(filter->mask, addr[n].u); } /* For ALLMULTI just set the mask to all ones. * This overrides the mask populated above. */ if ((uf.flags & TUN_FLT_ALLMULTI)) memset(filter->mask, ~0, sizeof(filter->mask)); /* Now enable the filter */ wmb(); filter->count = nexact; /* Return the number of exact filters */ err = nexact; free_addr: kfree(addr); return err; } /* Returns: 0 - drop, !=0 - accept */ static int run_filter(struct tap_filter *filter, const struct sk_buff *skb) { /* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect * at this point. */ struct ethhdr *eh = (struct ethhdr *) skb->data; int i; /* Exact match */ for (i = 0; i < filter->count; i++) if (ether_addr_equal(eh->h_dest, filter->addr[i])) return 1; /* Inexact match (multicast only) */ if (is_multicast_ether_addr(eh->h_dest)) return addr_hash_test(filter->mask, eh->h_dest); return 0; } /* * Checks whether the packet is accepted or not. * Returns: 0 - drop, !=0 - accept */ static int check_filter(struct tap_filter *filter, const struct sk_buff *skb) { if (!filter->count) return 1; return run_filter(filter, skb); } /* Network device part of the driver */ static const struct ethtool_ops tun_ethtool_ops; static int tun_net_init(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); struct ifreq *ifr = tun->ifr; int err; spin_lock_init(&tun->lock); err = security_tun_dev_alloc_security(&tun->security); if (err < 0) return err; tun_flow_init(tun); dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; dev->features = dev->hw_features; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); dev->lltx = true; tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); INIT_LIST_HEAD(&tun->disabled); err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI, ifr->ifr_flags & IFF_NAPI_FRAGS, false); if (err < 0) { tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); return err; } return 0; } /* Net device detach from fd. */ static void tun_net_uninit(struct net_device *dev) { tun_detach_all(dev); } /* Net device open. */ static int tun_net_open(struct net_device *dev) { netif_tx_start_all_queues(dev); return 0; } /* Net device close. */ static int tun_net_close(struct net_device *dev) { netif_tx_stop_all_queues(dev); return 0; } /* Net device start xmit */ static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb) { #ifdef CONFIG_RPS if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) { /* Select queue was not called for the skbuff, so we extract the * RPS hash and save it into the flow_table here. */ struct tun_flow_entry *e; __u32 rxhash; rxhash = __skb_get_hash_symmetric(skb); e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash); if (e) tun_flow_save_rps_rxhash(e, rxhash); } #endif } static unsigned int run_ebpf_filter(struct tun_struct *tun, struct sk_buff *skb, int len) { struct tun_prog *prog = rcu_dereference(tun->filter_prog); if (prog) len = bpf_prog_run_clear_cb(prog->prog, skb); return len; } /* Net device start xmit */ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); enum skb_drop_reason drop_reason; int txq = skb->queue_mapping; struct netdev_queue *queue; struct tun_file *tfile; int len = skb->len; rcu_read_lock(); tfile = rcu_dereference(tun->tfiles[txq]); /* Drop packet if interface is not attached */ if (!tfile) { drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; } if (!rcu_dereference(tun->steering_prog)) tun_automq_xmit(tun, skb); netif_info(tun, tx_queued, tun->dev, "%s %d\n", __func__, skb->len); /* Drop if the filter does not like it. * This is a noop if the filter is disabled. * Filter can be enabled only for the TAP devices. */ if (!check_filter(&tun->txflt, skb)) { drop_reason = SKB_DROP_REASON_TAP_TXFILTER; goto drop; } if (tfile->socket.sk->sk_filter && sk_filter(tfile->socket.sk, skb)) { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto drop; } len = run_ebpf_filter(tun, skb, len); if (len == 0) { drop_reason = SKB_DROP_REASON_TAP_FILTER; goto drop; } if (pskb_trim(skb, len)) { drop_reason = SKB_DROP_REASON_NOMEM; goto drop; } if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto drop; } skb_tx_timestamp(skb); /* Orphan the skb - required as we might hang on to it * for indefinite time. */ skb_orphan(skb); nf_reset_ct(skb); if (ptr_ring_produce(&tfile->tx_ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; } /* dev->lltx requires to do our own update of trans_start */ queue = netdev_get_tx_queue(dev, txq); txq_trans_cond_update(queue); /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); tfile->socket.sk->sk_data_ready(tfile->socket.sk); rcu_read_unlock(); return NETDEV_TX_OK; drop: dev_core_stats_tx_dropped_inc(dev); skb_tx_error(skb); kfree_skb_reason(skb, drop_reason); rcu_read_unlock(); return NET_XMIT_DROP; } static void tun_net_mclist(struct net_device *dev) { /* * This callback is supposed to deal with mc filter in * _rx_ path and has nothing to do with the _tx_ path. * In rx path we always accept everything userspace gives us. */ } static netdev_features_t tun_net_fix_features(struct net_device *dev, netdev_features_t features) { struct tun_struct *tun = netdev_priv(dev); return (features & tun->set_features) | (features & ~TUN_USER_FEATURES); } static void tun_set_headroom(struct net_device *dev, int new_hr) { struct tun_struct *tun = netdev_priv(dev); if (new_hr < NET_SKB_PAD) new_hr = NET_SKB_PAD; tun->align = new_hr; } static void tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct tun_struct *tun = netdev_priv(dev); dev_get_tstats64(dev, stats); stats->rx_frame_errors += (unsigned long)atomic_long_read(&tun->rx_frame_errors); } static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; struct bpf_prog *old_prog; int i; old_prog = rtnl_dereference(tun->xdp_prog); rcu_assign_pointer(tun->xdp_prog, prog); if (old_prog) bpf_prog_put(old_prog); for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); if (prog) sock_set_flag(&tfile->sk, SOCK_XDP); else sock_reset_flag(&tfile->sk, SOCK_XDP); } list_for_each_entry(tfile, &tun->disabled, next) { if (prog) sock_set_flag(&tfile->sk, SOCK_XDP); else sock_reset_flag(&tfile->sk, SOCK_XDP); } return 0; } static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return tun_xdp_set(dev, xdp->prog, xdp->extack); default: return -EINVAL; } } static int tun_net_change_carrier(struct net_device *dev, bool new_carrier) { if (new_carrier) { struct tun_struct *tun = netdev_priv(dev); if (!tun->numqueues) return -EPERM; netif_carrier_on(dev); } else { netif_carrier_off(dev); } return 0; } static const struct net_device_ops tun_netdev_ops = { .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, .ndo_start_xmit = tun_net_xmit, .ndo_fix_features = tun_net_fix_features, .ndo_select_queue = tun_select_queue, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, .ndo_change_carrier = tun_net_change_carrier, }; static void __tun_xdp_flush_tfile(struct tun_file *tfile) { /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); tfile->socket.sk->sk_data_ready(tfile->socket.sk); } static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; u32 numqueues; int nxmit = 0; int i; if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; rcu_read_lock(); resample: numqueues = READ_ONCE(tun->numqueues); if (!numqueues) { rcu_read_unlock(); return -ENXIO; /* Caller will free/return all frames */ } tfile = rcu_dereference(tun->tfiles[smp_processor_id() % numqueues]); if (unlikely(!tfile)) goto resample; spin_lock(&tfile->tx_ring.producer_lock); for (i = 0; i < n; i++) { struct xdp_frame *xdp = frames[i]; /* Encode the XDP flag into lowest bit for consumer to differ * XDP buffer from sk_buff. */ void *frame = tun_xdp_to_ptr(xdp); if (__ptr_ring_produce(&tfile->tx_ring, frame)) { dev_core_stats_tx_dropped_inc(dev); break; } nxmit++; } spin_unlock(&tfile->tx_ring.producer_lock); if (flags & XDP_XMIT_FLUSH) __tun_xdp_flush_tfile(tfile); rcu_read_unlock(); return nxmit; } static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) { struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); int nxmit; if (unlikely(!frame)) return -EOVERFLOW; nxmit = tun_xdp_xmit(dev, 1, &frame, XDP_XMIT_FLUSH); if (!nxmit) xdp_return_frame_rx_napi(frame); return nxmit; } static const struct net_device_ops tap_netdev_ops = { .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, .ndo_start_xmit = tun_net_xmit, .ndo_fix_features = tun_net_fix_features, .ndo_set_rx_mode = tun_net_mclist, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_select_queue = tun_select_queue, .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = tun_set_headroom, .ndo_bpf = tun_xdp, .ndo_xdp_xmit = tun_xdp_xmit, .ndo_change_carrier = tun_net_change_carrier, }; static void tun_flow_init(struct tun_struct *tun) { int i; for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) INIT_HLIST_HEAD(&tun->flows[i]); tun->ageing_time = TUN_FLOW_EXPIRE; timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, 0); mod_timer(&tun->flow_gc_timer, round_jiffies_up(jiffies + tun->ageing_time)); } static void tun_flow_uninit(struct tun_struct *tun) { del_timer_sync(&tun->flow_gc_timer); tun_flow_flush(tun); } #define MIN_MTU 68 #define MAX_MTU 65535 /* Initialize net device. */ static void tun_net_initialize(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: dev->netdev_ops = &tun_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; /* Point-to-Point TUN Device */ dev->hard_header_len = 0; dev->addr_len = 0; dev->mtu = 1500; /* Zero header length */ dev->type = ARPHRD_NONE; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; break; case IFF_TAP: dev->netdev_ops = &tap_netdev_ops; /* Ethernet TAP Device */ ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; eth_hw_addr_random(dev); /* Currently tun does not support XDP, only tap does. */ dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_NDO_XMIT; break; } dev->min_mtu = MIN_MTU; dev->max_mtu = MAX_MTU - dev->hard_header_len; } static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile) { struct sock *sk = tfile->socket.sk; return (tun->dev->flags & IFF_UP) && sock_writeable(sk); } /* Character device part */ /* Poll */ static __poll_t tun_chr_poll(struct file *file, poll_table *wait) { struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); struct sock *sk; __poll_t mask = 0; if (!tun) return EPOLLERR; sk = tfile->socket.sk; poll_wait(file, sk_sleep(sk), wait); if (!ptr_ring_empty(&tfile->tx_ring)) mask |= EPOLLIN | EPOLLRDNORM; /* Make sure SOCKWQ_ASYNC_NOSPACE is set if not writable to * guarantee EPOLLOUT to be raised by either here or * tun_sock_write_space(). Then process could get notification * after it writes to a down device and meets -EIO. */ if (tun_sock_writeable(tun, tfile) || (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) && tun_sock_writeable(tun, tfile))) mask |= EPOLLOUT | EPOLLWRNORM; if (tun->dev->reg_state != NETREG_REGISTERED) mask = EPOLLERR; tun_put(tun); return mask; } static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, size_t len, const struct iov_iter *it) { struct sk_buff *skb; size_t linear; int err; int i; if (it->nr_segs > MAX_SKB_FRAGS + 1 || len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN)) return ERR_PTR(-EMSGSIZE); local_bh_disable(); skb = napi_get_frags(&tfile->napi); local_bh_enable(); if (!skb) return ERR_PTR(-ENOMEM); linear = iov_iter_single_seg_count(it); err = __skb_grow(skb, linear); if (err) goto free; skb->len = len; skb->data_len = len - linear; skb->truesize += skb->data_len; for (i = 1; i < it->nr_segs; i++) { const struct iovec *iov = iter_iov(it); size_t fragsz = iov->iov_len; struct page *page; void *frag; if (fragsz == 0 || fragsz > PAGE_SIZE) { err = -EINVAL; goto free; } frag = netdev_alloc_frag(fragsz); if (!frag) { err = -ENOMEM; goto free; } page = virt_to_head_page(frag); skb_fill_page_desc(skb, i - 1, page, frag - page_address(page), fragsz); } return skb; free: /* frees skb and all frags allocated with napi_alloc_frag() */ napi_free_frags(&tfile->napi); return ERR_PTR(err); } /* prepad is the amount to reserve at front. len is length after that. * linear is a hint as to how much to copy (usually headers). */ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, size_t prepad, size_t len, size_t linear, int noblock) { struct sock *sk = tfile->socket.sk; struct sk_buff *skb; int err; /* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, &err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return ERR_PTR(err); skb_reserve(skb, prepad); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, int more) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; struct sk_buff_head process_queue; u32 rx_batched = tun->rx_batched; bool rcv = false; if (!rx_batched || (!more && skb_queue_empty(queue))) { local_bh_disable(); skb_record_rx_queue(skb, tfile->queue_index); netif_receive_skb(skb); local_bh_enable(); return; } spin_lock(&queue->lock); if (!more || skb_queue_len(queue) == rx_batched) { __skb_queue_head_init(&process_queue); skb_queue_splice_tail_init(queue, &process_queue); rcv = true; } else { __skb_queue_tail(queue, skb); } spin_unlock(&queue->lock); if (rcv) { struct sk_buff *nskb; local_bh_disable(); while ((nskb = __skb_dequeue(&process_queue))) { skb_record_rx_queue(nskb, tfile->queue_index); netif_receive_skb(nskb); } skb_record_rx_queue(skb, tfile->queue_index); netif_receive_skb(skb); local_bh_enable(); } } static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile, int len, int noblock, bool zerocopy) { if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) return false; if (tfile->socket.sk->sk_sndbuf != INT_MAX) return false; if (!noblock) return false; if (zerocopy) return false; if (SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) return false; return true; } static struct sk_buff *__tun_build_skb(struct tun_file *tfile, struct page_frag *alloc_frag, char *buf, int buflen, int len, int pad) { struct sk_buff *skb = build_skb(buf, buflen); if (!skb) return ERR_PTR(-ENOMEM); skb_reserve(skb, pad); skb_put(skb, len); skb_set_owner_w(skb, tfile->socket.sk); get_page(alloc_frag->page); alloc_frag->offset += buflen; return skb; } static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, struct xdp_buff *xdp, u32 act) { int err; switch (act) { case XDP_REDIRECT: err = xdp_do_redirect(tun->dev, xdp, xdp_prog); if (err) { dev_core_stats_rx_dropped_inc(tun->dev); return err; } dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); break; case XDP_TX: err = tun_xdp_tx(tun->dev, xdp); if (err < 0) { dev_core_stats_rx_dropped_inc(tun->dev); return err; } dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); break; case XDP_PASS: break; default: bpf_warn_invalid_xdp_action(tun->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(tun->dev, xdp_prog, act); fallthrough; case XDP_DROP: dev_core_stats_rx_dropped_inc(tun->dev); break; } return act; } static struct sk_buff *tun_build_skb(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *from, struct virtio_net_hdr *hdr, int len, int *skb_xdp) { struct page_frag *alloc_frag = &current->task_frag; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct bpf_prog *xdp_prog; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); char *buf; size_t copied; int pad = TUN_RX_PAD; int err = 0; rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) pad += XDP_PACKET_HEADROOM; buflen += SKB_DATA_ALIGN(len + pad); rcu_read_unlock(); alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL))) return ERR_PTR(-ENOMEM); buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; copied = copy_page_from_iter(alloc_frag->page, alloc_frag->offset + pad, len, from); if (copied != len) return ERR_PTR(-EFAULT); /* There's a small window that XDP may be set after the check * of xdp_prog above, this should be rare and for simplicity * we do XDP on skb in case the headroom is not enough. */ if (hdr->gso_type || !xdp_prog) { *skb_xdp = 1; return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad); } *skb_xdp = 0; local_bh_disable(); rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { struct xdp_buff xdp; u32 act; xdp_init_buff(&xdp, buflen, &tfile->xdp_rxq); xdp_prepare_buff(&xdp, buf, pad, len, false); act = bpf_prog_run_xdp(xdp_prog, &xdp); if (act == XDP_REDIRECT || act == XDP_TX) { get_page(alloc_frag->page); alloc_frag->offset += buflen; } err = tun_xdp_act(tun, xdp_prog, &xdp, act); if (err < 0) { if (act == XDP_REDIRECT || act == XDP_TX) put_page(alloc_frag->page); goto out; } if (err == XDP_REDIRECT) xdp_do_flush(); if (err != XDP_PASS) goto out; pad = xdp.data - xdp.data_hard_start; len = xdp.data_end - xdp.data; } bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad); out: bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); return NULL; } /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, struct iov_iter *from, int noblock, bool more) { struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb; size_t total_len = iov_iter_count(from); size_t len = total_len, align = tun->align, linear; struct virtio_net_hdr gso = { 0 }; int good_linear; int copylen; bool zerocopy = false; int err; u32 rxhash = 0; int skb_xdp = 1; bool frags = tun_napi_frags_enabled(tfile); enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (!(tun->flags & IFF_NO_PI)) { if (len < sizeof(pi)) return -EINVAL; len -= sizeof(pi); if (!copy_from_iter_full(&pi, sizeof(pi), from)) return -EFAULT; } if (tun->flags & IFF_VNET_HDR) { int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); if (len < vnet_hdr_sz) return -EINVAL; len -= vnet_hdr_sz; if (!copy_from_iter_full(&gso, sizeof(gso), from)) return -EFAULT; if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2 > tun16_to_cpu(tun, gso.hdr_len)) gso.hdr_len = cpu_to_tun16(tun, tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2); if (tun16_to_cpu(tun, gso.hdr_len) > len) return -EINVAL; iov_iter_advance(from, vnet_hdr_sz - sizeof(gso)); } if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) { align += NET_IP_ALIGN; if (unlikely(len < ETH_HLEN || (gso.hdr_len && tun16_to_cpu(tun, gso.hdr_len) < ETH_HLEN))) return -EINVAL; } good_linear = SKB_MAX_HEAD(align); if (msg_control) { struct iov_iter i = *from; /* There are 256 bytes to be copied in skb, so there is * enough room for skb expand head in case it is used. * The rest of the buffer is mapped from userspace. */ copylen = gso.hdr_len ? tun16_to_cpu(tun, gso.hdr_len) : GOODCOPY_LEN; if (copylen > good_linear) copylen = good_linear; linear = copylen; iov_iter_advance(&i, copylen); if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) zerocopy = true; } if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { /* For the packet that is not easy to be processed * (e.g gso or jumbo packet), we will do it at after * skb was created with generic XDP routine. */ skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp); err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; if (!skb) return total_len; } else { if (!zerocopy) { copylen = len; if (tun16_to_cpu(tun, gso.hdr_len) > good_linear) linear = good_linear; else linear = tun16_to_cpu(tun, gso.hdr_len); } if (frags) { mutex_lock(&tfile->napi_mutex); skb = tun_napi_alloc_frags(tfile, copylen, from); /* tun_napi_alloc_frags() enforces a layout for the skb. * If zerocopy is enabled, then this layout will be * overwritten by zerocopy_sg_from_iter(). */ zerocopy = false; } else { if (!linear) linear = min_t(size_t, good_linear, copylen); skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); } err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; if (zerocopy) err = zerocopy_sg_from_iter(skb, from); else err = skb_copy_datagram_from_iter(skb, 0, from, len); if (err) { err = -EFAULT; drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto drop; } } if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) { atomic_long_inc(&tun->rx_frame_errors); err = -EINVAL; goto free_skb; } switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: if (tun->flags & IFF_NO_PI) { u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; switch (ip_version) { case 4: pi.proto = htons(ETH_P_IP); break; case 6: pi.proto = htons(ETH_P_IPV6); break; default: err = -EINVAL; goto drop; } } skb_reset_mac_header(skb); skb->protocol = pi.proto; skb->dev = tun->dev; break; case IFF_TAP: if (frags && !pskb_may_pull(skb, ETH_HLEN)) { err = -ENOMEM; drop_reason = SKB_DROP_REASON_HDR_TRUNC; goto drop; } skb->protocol = eth_type_trans(skb, tun->dev); break; } /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_zcopy_init(skb, msg_control); } else if (msg_control) { struct ubuf_info *uarg = msg_control; uarg->ops->complete(NULL, uarg, false); } skb_reset_network_header(skb); skb_probe_transport_header(skb); skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { struct bpf_prog *xdp_prog; int ret; local_bh_disable(); rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { ret = do_xdp_generic(xdp_prog, &skb); if (ret != XDP_PASS) { rcu_read_unlock(); local_bh_enable(); goto unlock_frags; } } rcu_read_unlock(); local_bh_enable(); } /* Compute the costly rx hash only if needed for flow updates. * We may get a very small possibility of OOO during switching, not * worth to optimize. */ if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > 1 && !tfile->detached) rxhash = __skb_get_hash_symmetric(skb); rcu_read_lock(); if (unlikely(!(tun->dev->flags & IFF_UP))) { err = -EIO; rcu_read_unlock(); drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; } if (frags) { u32 headlen; /* Exercise flow dissector code path. */ skb_push(skb, ETH_HLEN); headlen = eth_get_headlen(tun->dev, skb->data, skb_headlen(skb)); if (unlikely(headlen > skb_headlen(skb))) { WARN_ON_ONCE(1); err = -ENOMEM; dev_core_stats_rx_dropped_inc(tun->dev); napi_busy: napi_free_frags(&tfile->napi); rcu_read_unlock(); mutex_unlock(&tfile->napi_mutex); return err; } if (likely(napi_schedule_prep(&tfile->napi))) { local_bh_disable(); napi_gro_frags(&tfile->napi); napi_complete(&tfile->napi); local_bh_enable(); } else { err = -EBUSY; goto napi_busy; } mutex_unlock(&tfile->napi_mutex); } else if (tfile->napi_enabled) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; int queue_len; spin_lock_bh(&queue->lock); if (unlikely(tfile->detached)) { spin_unlock_bh(&queue->lock); rcu_read_unlock(); err = -EBUSY; goto free_skb; } __skb_queue_tail(queue, skb); queue_len = skb_queue_len(queue); spin_unlock(&queue->lock); if (!more || queue_len > NAPI_POLL_WEIGHT) napi_schedule(&tfile->napi); local_bh_enable(); } else if (!IS_ENABLED(CONFIG_4KSTACKS)) { tun_rx_batched(tun, tfile, skb, more); } else { netif_rx(skb); } rcu_read_unlock(); preempt_disable(); dev_sw_netstats_rx_add(tun->dev, len); preempt_enable(); if (rxhash) tun_flow_update(tun, rxhash, tfile); return total_len; drop: if (err != -EAGAIN) dev_core_stats_rx_dropped_inc(tun->dev); free_skb: if (!IS_ERR_OR_NULL(skb)) kfree_skb_reason(skb, drop_reason); unlock_frags: if (frags) { tfile->napi.skb = NULL; mutex_unlock(&tfile->napi_mutex); } return err ?: total_len; } static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); ssize_t result; int noblock = 0; if (!tun) return -EBADFD; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; result = tun_get_user(tun, tfile, NULL, from, noblock, false); tun_put(tun); return result; } static ssize_t tun_put_user_xdp(struct tun_struct *tun, struct tun_file *tfile, struct xdp_frame *xdp_frame, struct iov_iter *iter) { int vnet_hdr_sz = 0; size_t size = xdp_frame->len; size_t ret; if (tun->flags & IFF_VNET_HDR) { struct virtio_net_hdr gso = { 0 }; vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); if (unlikely(iov_iter_count(iter) < vnet_hdr_sz)) return -EINVAL; if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))) return -EFAULT; iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); } ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz; preempt_disable(); dev_sw_netstats_tx_add(tun->dev, 1, ret); preempt_enable(); return ret; } /* Put packet to the user space buffer */ static ssize_t tun_put_user(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, struct iov_iter *iter) { struct tun_pi pi = { 0, skb->protocol }; ssize_t total; int vlan_offset = 0; int vlan_hlen = 0; int vnet_hdr_sz = 0; if (skb_vlan_tag_present(skb)) vlan_hlen = VLAN_HLEN; if (tun->flags & IFF_VNET_HDR) vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); total = skb->len + vlan_hlen + vnet_hdr_sz; if (!(tun->flags & IFF_NO_PI)) { if (iov_iter_count(iter) < sizeof(pi)) return -EINVAL; total += sizeof(pi); if (iov_iter_count(iter) < total) { /* Packet will be striped */ pi.flags |= TUN_PKT_STRIP; } if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi)) return -EFAULT; } if (vnet_hdr_sz) { struct virtio_net_hdr gso; if (iov_iter_count(iter) < vnet_hdr_sz) return -EINVAL; if (virtio_net_hdr_from_skb(skb, &gso, tun_is_little_endian(tun), true, vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb); if (net_ratelimit()) { netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), tun16_to_cpu(tun, gso.hdr_len)); print_hex_dump(KERN_ERR, "tun: ", DUMP_PREFIX_NONE, 16, 1, skb->head, min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); } WARN_ON_ONCE(1); return -EINVAL; } if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso)) return -EFAULT; iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); } if (vlan_hlen) { int ret; struct veth veth; veth.h_vlan_proto = skb->vlan_proto; veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); if (ret || !iov_iter_count(iter)) goto done; ret = copy_to_iter(&veth, sizeof(veth), iter); if (ret != sizeof(veth) || !iov_iter_count(iter)) goto done; } skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: /* caller is in process context, */ preempt_disable(); dev_sw_netstats_tx_add(tun->dev, 1, skb->len + vlan_hlen); preempt_enable(); return total; } static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err) { DECLARE_WAITQUEUE(wait, current); void *ptr = NULL; int error = 0; ptr = ptr_ring_consume(&tfile->tx_ring); if (ptr) goto out; if (noblock) { error = -EAGAIN; goto out; } add_wait_queue(&tfile->socket.wq.wait, &wait); while (1) { set_current_state(TASK_INTERRUPTIBLE); ptr = ptr_ring_consume(&tfile->tx_ring); if (ptr) break; if (signal_pending(current)) { error = -ERESTARTSYS; break; } if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) { error = -EFAULT; break; } schedule(); } __set_current_state(TASK_RUNNING); remove_wait_queue(&tfile->socket.wq.wait, &wait); out: *err = error; return ptr; } static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *to, int noblock, void *ptr) { ssize_t ret; int err; if (!iov_iter_count(to)) { tun_ptr_free(ptr); return 0; } if (!ptr) { /* Read frames from ring */ ptr = tun_ring_recv(tfile, noblock, &err); if (!ptr) return err; } if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); ret = tun_put_user_xdp(tun, tfile, xdpf, to); xdp_return_frame(xdpf); } else { struct sk_buff *skb = ptr; ret = tun_put_user(tun, tfile, skb, to); if (unlikely(ret < 0)) kfree_skb(skb); else consume_skb(skb); } return ret; } static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); ssize_t len = iov_iter_count(to), ret; int noblock = 0; if (!tun) return -EBADFD; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; ret = tun_do_read(tun, tfile, to, noblock, NULL); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; tun_put(tun); return ret; } static void tun_prog_free(struct rcu_head *rcu) { struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu); bpf_prog_destroy(prog->prog); kfree(prog); } static int __tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, struct bpf_prog *prog) { struct tun_prog *old, *new = NULL; if (prog) { new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) return -ENOMEM; new->prog = prog; } spin_lock_bh(&tun->lock); old = rcu_dereference_protected(*prog_p, lockdep_is_held(&tun->lock)); rcu_assign_pointer(*prog_p, new); spin_unlock_bh(&tun->lock); if (old) call_rcu(&old->rcu, tun_prog_free); return 0; } static void tun_free_netdev(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); BUG_ON(!(list_empty(&tun->disabled))); tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); __tun_set_ebpf(tun, &tun->steering_prog, NULL); __tun_set_ebpf(tun, &tun->filter_prog, NULL); } static void tun_setup(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); tun->owner = INVALID_UID; tun->group = INVALID_GID; tun_default_link_ksettings(dev, &tun->link_ksettings); dev->ethtool_ops = &tun_ethtool_ops; dev->needs_free_netdev = true; dev->priv_destructor = tun_free_netdev; /* We prefer our own queue length */ dev->tx_queue_len = TUN_READQ_SIZE; } /* Trivial set of netlink ops to allow deleting tun or tap * device with netlink. */ static int tun_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "tun/tap creation via rtnetlink is not supported."); return -EOPNOTSUPP; } static size_t tun_get_size(const struct net_device *dev) { BUILD_BUG_ON(sizeof(u32) != sizeof(uid_t)); BUILD_BUG_ON(sizeof(u32) != sizeof(gid_t)); return nla_total_size(sizeof(uid_t)) + /* OWNER */ nla_total_size(sizeof(gid_t)) + /* GROUP */ nla_total_size(sizeof(u8)) + /* TYPE */ nla_total_size(sizeof(u8)) + /* PI */ nla_total_size(sizeof(u8)) + /* VNET_HDR */ nla_total_size(sizeof(u8)) + /* PERSIST */ nla_total_size(sizeof(u8)) + /* MULTI_QUEUE */ nla_total_size(sizeof(u32)) + /* NUM_QUEUES */ nla_total_size(sizeof(u32)) + /* NUM_DISABLED_QUEUES */ 0; } static int tun_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); if (nla_put_u8(skb, IFLA_TUN_TYPE, tun->flags & TUN_TYPE_MASK)) goto nla_put_failure; if (uid_valid(tun->owner) && nla_put_u32(skb, IFLA_TUN_OWNER, from_kuid_munged(current_user_ns(), tun->owner))) goto nla_put_failure; if (gid_valid(tun->group) && nla_put_u32(skb, IFLA_TUN_GROUP, from_kgid_munged(current_user_ns(), tun->group))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_PI, !(tun->flags & IFF_NO_PI))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_VNET_HDR, !!(tun->flags & IFF_VNET_HDR))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_PERSIST, !!(tun->flags & IFF_PERSIST))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_MULTI_QUEUE, !!(tun->flags & IFF_MULTI_QUEUE))) goto nla_put_failure; if (tun->flags & IFF_MULTI_QUEUE) { if (nla_put_u32(skb, IFLA_TUN_NUM_QUEUES, tun->numqueues)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_TUN_NUM_DISABLED_QUEUES, tun->numdisabled)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops tun_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct tun_struct), .setup = tun_setup, .validate = tun_validate, .get_size = tun_get_size, .fill_info = tun_fill_info, }; static void tun_sock_write_space(struct sock *sk) { struct tun_file *tfile; wait_queue_head_t *wqueue; if (!sock_writeable(sk)) return; if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); if (wqueue && waitqueue_active(wqueue)) wake_up_interruptible_sync_poll(wqueue, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); tfile = container_of(sk, struct tun_file, sk); kill_fasync(&tfile->fasync, SIGIO, POLL_OUT); } static void tun_put_page(struct tun_page *tpage) { if (tpage->page) __page_frag_cache_drain(tpage->page, tpage->count); } static int tun_xdp_one(struct tun_struct *tun, struct tun_file *tfile, struct xdp_buff *xdp, int *flush, struct tun_page *tpage) { unsigned int datasize = xdp->data_end - xdp->data; struct tun_xdp_hdr *hdr = xdp->data_hard_start; struct virtio_net_hdr *gso = &hdr->gso; struct bpf_prog *xdp_prog; struct sk_buff *skb = NULL; struct sk_buff_head *queue; u32 rxhash = 0, act; int buflen = hdr->buflen; int ret = 0; bool skb_xdp = false; struct page *page; if (unlikely(datasize < ETH_HLEN)) return -EINVAL; xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { if (gso->gso_type) { skb_xdp = true; goto build; } xdp_init_buff(xdp, buflen, &tfile->xdp_rxq); xdp_set_data_meta_invalid(xdp); act = bpf_prog_run_xdp(xdp_prog, xdp); ret = tun_xdp_act(tun, xdp_prog, xdp, act); if (ret < 0) { put_page(virt_to_head_page(xdp->data)); return ret; } switch (ret) { case XDP_REDIRECT: *flush = true; fallthrough; case XDP_TX: return 0; case XDP_PASS: break; default: page = virt_to_head_page(xdp->data); if (tpage->page == page) { ++tpage->count; } else { tun_put_page(tpage); tpage->page = page; tpage->count = 1; } return 0; } } build: skb = build_skb(xdp->data_hard_start, buflen); if (!skb) { ret = -ENOMEM; goto out; } skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) { atomic_long_inc(&tun->rx_frame_errors); kfree_skb(skb); ret = -EINVAL; goto out; } skb->protocol = eth_type_trans(skb, tun->dev); skb_reset_network_header(skb); skb_probe_transport_header(skb); skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { ret = do_xdp_generic(xdp_prog, &skb); if (ret != XDP_PASS) { ret = 0; goto out; } } if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 && !tfile->detached) rxhash = __skb_get_hash_symmetric(skb); if (tfile->napi_enabled) { queue = &tfile->sk.sk_write_queue; spin_lock(&queue->lock); if (unlikely(tfile->detached)) { spin_unlock(&queue->lock); kfree_skb(skb); return -EBUSY; } __skb_queue_tail(queue, skb); spin_unlock(&queue->lock); ret = 1; } else { netif_receive_skb(skb); ret = 0; } /* No need to disable preemption here since this function is * always called with bh disabled */ dev_sw_netstats_rx_add(tun->dev, datasize); if (rxhash) tun_flow_update(tun, rxhash, tfile); out: return ret; } static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { int ret, i; struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); struct tun_msg_ctl *ctl = m->msg_control; struct xdp_buff *xdp; if (!tun) return -EBADFD; if (m->msg_controllen == sizeof(struct tun_msg_ctl) && ctl && ctl->type == TUN_MSG_PTR) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct tun_page tpage; int n = ctl->num; int flush = 0, queued = 0; memset(&tpage, 0, sizeof(tpage)); local_bh_disable(); rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); for (i = 0; i < n; i++) { xdp = &((struct xdp_buff *)ctl->ptr)[i]; ret = tun_xdp_one(tun, tfile, xdp, &flush, &tpage); if (ret > 0) queued += ret; } if (flush) xdp_do_flush(); if (tfile->napi_enabled && queued > 0) napi_schedule(&tfile->napi); bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); tun_put_page(&tpage); ret = total_len; goto out; } ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter, m->msg_flags & MSG_DONTWAIT, m->msg_flags & MSG_MORE); out: tun_put(tun); return ret; } static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); void *ptr = m->msg_control; int ret; if (!tun) { ret = -EBADFD; goto out_free; } if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) { ret = -EINVAL; goto out_put_tun; } if (flags & MSG_ERRQUEUE) { ret = sock_recv_errqueue(sock->sk, m, total_len, SOL_PACKET, TUN_TX_TIMESTAMP); goto out; } ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, ptr); if (ret > (ssize_t)total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; } out: tun_put(tun); return ret; out_put_tun: tun_put(tun); out_free: tun_ptr_free(ptr); return ret; } static int tun_ptr_peek_len(void *ptr) { if (likely(ptr)) { if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); return xdpf->len; } return __skb_array_len_with_tag(ptr); } else { return 0; } } static int tun_peek_len(struct socket *sock) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun; int ret = 0; tun = tun_get(tfile); if (!tun) return 0; ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len); tun_put(tun); return ret; } /* Ops structure to mimic raw sockets with tun */ static const struct proto_ops tun_socket_ops = { .peek_len = tun_peek_len, .sendmsg = tun_sendmsg, .recvmsg = tun_recvmsg, }; static struct proto tun_proto = { .name = "tun", .owner = THIS_MODULE, .obj_size = sizeof(struct tun_file), }; static int tun_flags(struct tun_struct *tun) { return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP); } static ssize_t tun_flags_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return sysfs_emit(buf, "0x%x\n", tun_flags(tun)); } static ssize_t owner_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return uid_valid(tun->owner)? sysfs_emit(buf, "%u\n", from_kuid_munged(current_user_ns(), tun->owner)) : sysfs_emit(buf, "-1\n"); } static ssize_t group_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return gid_valid(tun->group) ? sysfs_emit(buf, "%u\n", from_kgid_munged(current_user_ns(), tun->group)) : sysfs_emit(buf, "-1\n"); } static DEVICE_ATTR_RO(tun_flags); static DEVICE_ATTR_RO(owner); static DEVICE_ATTR_RO(group); static struct attribute *tun_dev_attrs[] = { &dev_attr_tun_flags.attr, &dev_attr_owner.attr, &dev_attr_group.attr, NULL }; static const struct attribute_group tun_attr_group = { .attrs = tun_dev_attrs }; static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) { struct tun_struct *tun; struct tun_file *tfile = file->private_data; struct net_device *dev; int err; if (tfile->detached) return -EINVAL; if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) { if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!(ifr->ifr_flags & IFF_NAPI) || (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP) return -EINVAL; } dev = __dev_get_by_name(net, ifr->ifr_name); if (dev) { if (ifr->ifr_flags & IFF_TUN_EXCL) return -EBUSY; if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops) tun = netdev_priv(dev); else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops) tun = netdev_priv(dev); else return -EINVAL; if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) != !!(tun->flags & IFF_MULTI_QUEUE)) return -EINVAL; if (tun_not_capable(tun)) return -EPERM; err = security_tun_dev_open(tun->security); if (err < 0) return err; err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER, ifr->ifr_flags & IFF_NAPI, ifr->ifr_flags & IFF_NAPI_FRAGS, true); if (err < 0) return err; if (tun->flags & IFF_MULTI_QUEUE && (tun->numqueues + tun->numdisabled > 1)) { /* One or more queue has already been attached, no need * to initialize the device again. */ netdev_state_change(dev); return 0; } tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); netdev_state_change(dev); } else { char *name; unsigned long flags = 0; int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ? MAX_TAP_QUEUES : 1; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; err = security_tun_dev_create(); if (err < 0) return err; /* Set dev type */ if (ifr->ifr_flags & IFF_TUN) { /* TUN device */ flags |= IFF_TUN; name = "tun%d"; } else if (ifr->ifr_flags & IFF_TAP) { /* TAP device */ flags |= IFF_TAP; name = "tap%d"; } else return -EINVAL; if (*ifr->ifr_name) name = ifr->ifr_name; dev = alloc_netdev_mqs(sizeof(struct tun_struct), name, NET_NAME_UNKNOWN, tun_setup, queues, queues); if (!dev) return -ENOMEM; dev_net_set(dev, net); dev->rtnl_link_ops = &tun_link_ops; dev->ifindex = tfile->ifindex; dev->sysfs_groups[0] = &tun_attr_group; tun = netdev_priv(dev); tun->dev = dev; tun->flags = flags; tun->txflt.count = 0; tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr); tun->align = NET_SKB_PAD; tun->filter_attached = false; tun->sndbuf = tfile->socket.sk->sk_sndbuf; tun->rx_batched = 0; RCU_INIT_POINTER(tun->steering_prog, NULL); tun->ifr = ifr; tun->file = file; tun_net_initialize(dev); err = register_netdevice(tun->dev); if (err < 0) { free_netdev(dev); return err; } /* free_netdev() won't check refcnt, to avoid race * with dev_put() we need publish tun after registration. */ rcu_assign_pointer(tfile->tun, tun); } if (ifr->ifr_flags & IFF_NO_CARRIER) netif_carrier_off(tun->dev); else netif_carrier_on(tun->dev); /* Make sure persistent devices do not get stuck in * xoff state. */ if (netif_running(tun->dev)) netif_tx_wake_all_queues(tun->dev); strcpy(ifr->ifr_name, tun->dev->name); return 0; } static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr) { strcpy(ifr->ifr_name, tun->dev->name); ifr->ifr_flags = tun_flags(tun); } /* This is like a cut-down ethtool ops, except done via tun fd so no * privs required. */ static int set_offload(struct tun_struct *tun, unsigned long arg) { netdev_features_t features = 0; if (arg & TUN_F_CSUM) { features |= NETIF_F_HW_CSUM; arg &= ~TUN_F_CSUM; if (arg & (TUN_F_TSO4|TUN_F_TSO6)) { if (arg & TUN_F_TSO_ECN) { features |= NETIF_F_TSO_ECN; arg &= ~TUN_F_TSO_ECN; } if (arg & TUN_F_TSO4) features |= NETIF_F_TSO; if (arg & TUN_F_TSO6) features |= NETIF_F_TSO6; arg &= ~(TUN_F_TSO4|TUN_F_TSO6); } arg &= ~TUN_F_UFO; /* TODO: for now USO4 and USO6 should work simultaneously */ if (arg & TUN_F_USO4 && arg & TUN_F_USO6) { features |= NETIF_F_GSO_UDP_L4; arg &= ~(TUN_F_USO4 | TUN_F_USO6); } } /* This gives the user a way to test for new features in future by * trying to set them. */ if (arg) return -EINVAL; tun->set_features = features; tun->dev->wanted_features &= ~TUN_USER_FEATURES; tun->dev->wanted_features |= features; netdev_update_features(tun->dev); return 0; } static void tun_detach_filter(struct tun_struct *tun, int n) { int i; struct tun_file *tfile; for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); lock_sock(tfile->socket.sk); sk_detach_filter(tfile->socket.sk); release_sock(tfile->socket.sk); } tun->filter_attached = false; } static int tun_attach_filter(struct tun_struct *tun) { int i, ret = 0; struct tun_file *tfile; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); lock_sock(tfile->socket.sk); ret = sk_attach_filter(&tun->fprog, tfile->socket.sk); release_sock(tfile->socket.sk); if (ret) { tun_detach_filter(tun, i); return ret; } } tun->filter_attached = true; return ret; } static void tun_set_sndbuf(struct tun_struct *tun) { struct tun_file *tfile; int i; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); tfile->socket.sk->sk_sndbuf = tun->sndbuf; } } static int tun_set_queue(struct file *file, struct ifreq *ifr) { struct tun_file *tfile = file->private_data; struct tun_struct *tun; int ret = 0; rtnl_lock(); if (ifr->ifr_flags & IFF_ATTACH_QUEUE) { tun = tfile->detached; if (!tun) { ret = -EINVAL; goto unlock; } ret = security_tun_dev_attach_queue(tun->security); if (ret < 0) goto unlock; ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI, tun->flags & IFF_NAPI_FRAGS, true); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached) ret = -EINVAL; else __tun_detach(tfile, false); } else ret = -EINVAL; if (ret >= 0) netdev_state_change(tun->dev); unlock: rtnl_unlock(); return ret; } static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, void __user *data) { struct bpf_prog *prog; int fd; if (copy_from_user(&fd, data, sizeof(fd))) return -EFAULT; if (fd == -1) { prog = NULL; } else { prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(prog)) return PTR_ERR(prog); } return __tun_set_ebpf(tun, prog_p, prog); } /* Return correct value for tun->dev->addr_len based on tun->dev->type. */ static unsigned char tun_get_addr_len(unsigned short type) { switch (type) { case ARPHRD_IP6GRE: case ARPHRD_TUNNEL6: return sizeof(struct in6_addr); case ARPHRD_IPGRE: case ARPHRD_TUNNEL: case ARPHRD_SIT: return 4; case ARPHRD_ETHER: return ETH_ALEN; case ARPHRD_IEEE802154: case ARPHRD_IEEE802154_MONITOR: return IEEE802154_EXTENDED_ADDR_LEN; case ARPHRD_PHONET_PIPE: case ARPHRD_PPP: case ARPHRD_NONE: return 0; case ARPHRD_6LOWPAN: return EUI64_ADDR_LEN; case ARPHRD_FDDI: return FDDI_K_ALEN; case ARPHRD_HIPPI: return HIPPI_ALEN; case ARPHRD_IEEE802: return FC_ALEN; case ARPHRD_ROSE: return ROSE_ADDR_LEN; case ARPHRD_NETROM: return AX25_ADDR_LEN; case ARPHRD_LOCALTLK: return LTALK_ALEN; default: return 0; } } static long __tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg, int ifreq_len) { struct tun_file *tfile = file->private_data; struct net *net = sock_net(&tfile->sk); struct tun_struct *tun; void __user* argp = (void __user*)arg; unsigned int carrier; struct ifreq ifr; kuid_t owner; kgid_t group; int ifindex; int sndbuf; int vnet_hdr_sz; int le; int ret; bool do_notify = false; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { if (copy_from_user(&ifr, argp, ifreq_len)) return -EFAULT; } else { memset(&ifr, 0, sizeof(ifr)); } if (cmd == TUNGETFEATURES) { /* Currently this just means: "what IFF flags are valid?". * This is needed because we never checked for invalid flags on * TUNSETIFF. */ return put_user(IFF_TUN | IFF_TAP | IFF_NO_CARRIER | TUN_FEATURES, (unsigned int __user*)argp); } else if (cmd == TUNSETQUEUE) { return tun_set_queue(file, &ifr); } else if (cmd == SIOCGSKNS) { if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; return open_related_ns(&net->ns, get_net_ns); } rtnl_lock(); tun = tun_get(tfile); if (cmd == TUNSETIFF) { ret = -EEXIST; if (tun) goto unlock; ifr.ifr_name[IFNAMSIZ-1] = '\0'; ret = tun_set_iff(net, file, &ifr); if (ret) goto unlock; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; goto unlock; } if (cmd == TUNSETIFINDEX) { ret = -EPERM; if (tun) goto unlock; ret = -EFAULT; if (copy_from_user(&ifindex, argp, sizeof(ifindex))) goto unlock; ret = -EINVAL; if (ifindex < 0) goto unlock; ret = 0; tfile->ifindex = ifindex; goto unlock; } ret = -EBADFD; if (!tun) goto unlock; netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd); net = dev_net(tun->dev); ret = 0; switch (cmd) { case TUNGETIFF: tun_get_iff(tun, &ifr); if (tfile->detached) ifr.ifr_flags |= IFF_DETACH_QUEUE; if (!tfile->socket.sk->sk_filter) ifr.ifr_flags |= IFF_NOFILTER; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; case TUNSETNOCSUM: /* Disable/Enable checksum */ /* [unimplemented] */ netif_info(tun, drv, tun->dev, "ignored: set checksum %s\n", arg ? "disabled" : "enabled"); break; case TUNSETPERSIST: /* Disable/Enable persist mode. Keep an extra reference to the * module to prevent the module being unprobed. */ if (arg && !(tun->flags & IFF_PERSIST)) { tun->flags |= IFF_PERSIST; __module_get(THIS_MODULE); do_notify = true; } if (!arg && (tun->flags & IFF_PERSIST)) { tun->flags &= ~IFF_PERSIST; module_put(THIS_MODULE); do_notify = true; } netif_info(tun, drv, tun->dev, "persist %s\n", arg ? "enabled" : "disabled"); break; case TUNSETOWNER: /* Set owner of the device */ owner = make_kuid(current_user_ns(), arg); if (!uid_valid(owner)) { ret = -EINVAL; break; } tun->owner = owner; do_notify = true; netif_info(tun, drv, tun->dev, "owner set to %u\n", from_kuid(&init_user_ns, tun->owner)); break; case TUNSETGROUP: /* Set group of the device */ group = make_kgid(current_user_ns(), arg); if (!gid_valid(group)) { ret = -EINVAL; break; } tun->group = group; do_notify = true; netif_info(tun, drv, tun->dev, "group set to %u\n", from_kgid(&init_user_ns, tun->group)); break; case TUNSETLINK: /* Only allow setting the type when the interface is down */ if (tun->dev->flags & IFF_UP) { netif_info(tun, drv, tun->dev, "Linktype set failed because interface is up\n"); ret = -EBUSY; } else { ret = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, tun->dev); ret = notifier_to_errno(ret); if (ret) { netif_info(tun, drv, tun->dev, "Refused to change device type\n"); break; } tun->dev->type = (int) arg; tun->dev->addr_len = tun_get_addr_len(tun->dev->type); netif_info(tun, drv, tun->dev, "linktype set to %d\n", tun->dev->type); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, tun->dev); } break; case TUNSETDEBUG: tun->msg_enable = (u32)arg; break; case TUNSETOFFLOAD: ret = set_offload(tun, arg); break; case TUNSETTXFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = update_filter(&tun->txflt, (void __user *)arg); break; case SIOCGIFHWADDR: /* Get hw address */ dev_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name); if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; case SIOCSIFHWADDR: /* Set hw address */ ret = dev_set_mac_address_user(tun->dev, &ifr.ifr_hwaddr, NULL); break; case TUNGETSNDBUF: sndbuf = tfile->socket.sk->sk_sndbuf; if (copy_to_user(argp, &sndbuf, sizeof(sndbuf))) ret = -EFAULT; break; case TUNSETSNDBUF: if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) { ret = -EFAULT; break; } if (sndbuf <= 0) { ret = -EINVAL; break; } tun->sndbuf = sndbuf; tun_set_sndbuf(tun); break; case TUNGETVNETHDRSZ: vnet_hdr_sz = tun->vnet_hdr_sz; if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz))) ret = -EFAULT; break; case TUNSETVNETHDRSZ: if (copy_from_user(&vnet_hdr_sz, argp, sizeof(vnet_hdr_sz))) { ret = -EFAULT; break; } if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) { ret = -EINVAL; break; } tun->vnet_hdr_sz = vnet_hdr_sz; break; case TUNGETVNETLE: le = !!(tun->flags & TUN_VNET_LE); if (put_user(le, (int __user *)argp)) ret = -EFAULT; break; case TUNSETVNETLE: if (get_user(le, (int __user *)argp)) { ret = -EFAULT; break; } if (le) tun->flags |= TUN_VNET_LE; else tun->flags &= ~TUN_VNET_LE; break; case TUNGETVNETBE: ret = tun_get_vnet_be(tun, argp); break; case TUNSETVNETBE: ret = tun_set_vnet_be(tun, argp); break; case TUNATTACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog))) break; ret = tun_attach_filter(tun); break; case TUNDETACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = 0; tun_detach_filter(tun, tun->numqueues); break; case TUNGETFILTER: ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog))) break; ret = 0; break; case TUNSETSTEERINGEBPF: ret = tun_set_ebpf(tun, &tun->steering_prog, argp); break; case TUNSETFILTEREBPF: ret = tun_set_ebpf(tun, &tun->filter_prog, argp); break; case TUNSETCARRIER: ret = -EFAULT; if (copy_from_user(&carrier, argp, sizeof(carrier))) goto unlock; ret = tun_net_change_carrier(tun->dev, (bool)carrier); break; case TUNGETDEVNETNS: ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto unlock; ret = open_related_ns(&net->ns, get_net_ns); break; default: ret = -EINVAL; break; } if (do_notify) netdev_state_change(tun->dev); unlock: rtnl_unlock(); if (tun) tun_put(tun); return ret; } static long tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return __tun_chr_ioctl(file, cmd, arg, sizeof (struct ifreq)); } #ifdef CONFIG_COMPAT static long tun_chr_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (cmd) { case TUNSETIFF: case TUNGETIFF: case TUNSETTXFILTER: case TUNGETSNDBUF: case TUNSETSNDBUF: case SIOCGIFHWADDR: case SIOCSIFHWADDR: arg = (unsigned long)compat_ptr(arg); break; default: arg = (compat_ulong_t)arg; break; } /* * compat_ifreq is shorter than ifreq, so we must not access beyond * the end of that structure. All fields that are used in this * driver are compatible though, we don't need to convert the * contents. */ return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq)); } #endif /* CONFIG_COMPAT */ static int tun_chr_fasync(int fd, struct file *file, int on) { struct tun_file *tfile = file->private_data; int ret; if (on) { ret = file_f_owner_allocate(file); if (ret) goto out; } if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0) goto out; if (on) { __f_setown(file, task_pid(current), PIDTYPE_TGID, 0); tfile->flags |= TUN_FASYNC; } else tfile->flags &= ~TUN_FASYNC; ret = 0; out: return ret; } static int tun_chr_open(struct inode *inode, struct file * file) { struct net *net = current->nsproxy->net_ns; struct tun_file *tfile; tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto, 0); if (!tfile) return -ENOMEM; if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) { sk_free(&tfile->sk); return -ENOMEM; } mutex_init(&tfile->napi_mutex); RCU_INIT_POINTER(tfile->tun, NULL); tfile->flags = 0; tfile->ifindex = 0; init_waitqueue_head(&tfile->socket.wq.wait); tfile->socket.file = file; tfile->socket.ops = &tun_socket_ops; sock_init_data_uid(&tfile->socket, &tfile->sk, current_fsuid()); tfile->sk.sk_write_space = tun_sock_write_space; tfile->sk.sk_sndbuf = INT_MAX; file->private_data = tfile; INIT_LIST_HEAD(&tfile->next); sock_set_flag(&tfile->sk, SOCK_ZEROCOPY); /* tun groks IOCB_NOWAIT just fine, mark it as such */ file->f_mode |= FMODE_NOWAIT; return 0; } static int tun_chr_close(struct inode *inode, struct file *file) { struct tun_file *tfile = file->private_data; tun_detach(tfile, true); return 0; } #ifdef CONFIG_PROC_FS static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file) { struct tun_file *tfile = file->private_data; struct tun_struct *tun; struct ifreq ifr; memset(&ifr, 0, sizeof(ifr)); rtnl_lock(); tun = tun_get(tfile); if (tun) tun_get_iff(tun, &ifr); rtnl_unlock(); if (tun) tun_put(tun); seq_printf(m, "iff:\t%s\n", ifr.ifr_name); } #endif static const struct file_operations tun_fops = { .owner = THIS_MODULE, .read_iter = tun_chr_read_iter, .write_iter = tun_chr_write_iter, .poll = tun_chr_poll, .unlocked_ioctl = tun_chr_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = tun_chr_compat_ioctl, #endif .open = tun_chr_open, .release = tun_chr_close, .fasync = tun_chr_fasync, #ifdef CONFIG_PROC_FS .show_fdinfo = tun_chr_show_fdinfo, #endif }; static struct miscdevice tun_miscdev = { .minor = TUN_MINOR, .name = "tun", .nodename = "net/tun", .fops = &tun_fops, }; /* ethtool interface */ static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { ethtool_link_ksettings_zero_link_mode(cmd, supported); ethtool_link_ksettings_zero_link_mode(cmd, advertising); cmd->base.speed = SPEED_10000; cmd->base.duplex = DUPLEX_FULL; cmd->base.port = PORT_TP; cmd->base.phy_address = 0; cmd->base.autoneg = AUTONEG_DISABLE; } static int tun_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct tun_struct *tun = netdev_priv(dev); memcpy(cmd, &tun->link_ksettings, sizeof(*cmd)); return 0; } static int tun_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { struct tun_struct *tun = netdev_priv(dev); memcpy(&tun->link_ksettings, cmd, sizeof(*cmd)); return 0; } static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct tun_struct *tun = netdev_priv(dev); strscpy(info->driver, DRV_NAME, sizeof(info->driver)); strscpy(info->version, DRV_VERSION, sizeof(info->version)); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: strscpy(info->bus_info, "tun", sizeof(info->bus_info)); break; case IFF_TAP: strscpy(info->bus_info, "tap", sizeof(info->bus_info)); break; } } static u32 tun_get_msglevel(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); return tun->msg_enable; } static void tun_set_msglevel(struct net_device *dev, u32 value) { struct tun_struct *tun = netdev_priv(dev); tun->msg_enable = value; } static int tun_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); ec->rx_max_coalesced_frames = tun->rx_batched; return 0; } static int tun_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT) tun->rx_batched = NAPI_POLL_WEIGHT; else tun->rx_batched = ec->rx_max_coalesced_frames; return 0; } static void tun_get_channels(struct net_device *dev, struct ethtool_channels *channels) { struct tun_struct *tun = netdev_priv(dev); channels->combined_count = tun->numqueues; channels->max_combined = tun->flags & IFF_MULTI_QUEUE ? MAX_TAP_QUEUES : 1; } static const struct ethtool_ops tun_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES, .get_drvinfo = tun_get_drvinfo, .get_msglevel = tun_get_msglevel, .set_msglevel = tun_set_msglevel, .get_link = ethtool_op_get_link, .get_channels = tun_get_channels, .get_ts_info = ethtool_op_get_ts_info, .get_coalesce = tun_get_coalesce, .set_coalesce = tun_set_coalesce, .get_link_ksettings = tun_get_link_ksettings, .set_link_ksettings = tun_set_link_ksettings, }; static int tun_queue_resize(struct tun_struct *tun) { struct net_device *dev = tun->dev; struct tun_file *tfile; struct ptr_ring **rings; int n = tun->numqueues + tun->numdisabled; int ret, i; rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL); if (!rings) return -ENOMEM; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); rings[i] = &tfile->tx_ring; } list_for_each_entry(tfile, &tun->disabled, next) rings[i++] = &tfile->tx_ring; ret = ptr_ring_resize_multiple(rings, n, dev->tx_queue_len, GFP_KERNEL, tun_ptr_free); kfree(rings); return ret; } static int tun_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tun_struct *tun = netdev_priv(dev); int i; if (dev->rtnl_link_ops != &tun_link_ops) return NOTIFY_DONE; switch (event) { case NETDEV_CHANGE_TX_QUEUE_LEN: if (tun_queue_resize(tun)) return NOTIFY_BAD; break; case NETDEV_UP: for (i = 0; i < tun->numqueues; i++) { struct tun_file *tfile; tfile = rtnl_dereference(tun->tfiles[i]); tfile->socket.sk->sk_write_space(tfile->socket.sk); } break; default: break; } return NOTIFY_DONE; } static struct notifier_block tun_notifier_block __read_mostly = { .notifier_call = tun_device_event, }; static int __init tun_init(void) { int ret = 0; pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION); ret = rtnl_link_register(&tun_link_ops); if (ret) { pr_err("Can't register link_ops\n"); goto err_linkops; } ret = misc_register(&tun_miscdev); if (ret) { pr_err("Can't register misc device %d\n", TUN_MINOR); goto err_misc; } ret = register_netdevice_notifier(&tun_notifier_block); if (ret) { pr_err("Can't register netdevice notifier\n"); goto err_notifier; } return 0; err_notifier: misc_deregister(&tun_miscdev); err_misc: rtnl_link_unregister(&tun_link_ops); err_linkops: return ret; } static void __exit tun_cleanup(void) { misc_deregister(&tun_miscdev); rtnl_link_unregister(&tun_link_ops); unregister_netdevice_notifier(&tun_notifier_block); } /* Get an underlying socket object from tun file. Returns error unless file is * attached to a device. The returned object works like a packet socket, it * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for * holding a reference to the file for as long as the socket is in use. */ struct socket *tun_get_socket(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); return &tfile->socket; } EXPORT_SYMBOL_GPL(tun_get_socket); struct ptr_ring *tun_get_tx_ring(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); return &tfile->tx_ring; } EXPORT_SYMBOL_GPL(tun_get_tx_ring); module_init(tun_init); module_exit(tun_cleanup); MODULE_DESCRIPTION(DRV_DESCRIPTION); MODULE_AUTHOR(DRV_COPYRIGHT); MODULE_LICENSE("GPL"); MODULE_ALIAS_MISCDEV(TUN_MINOR); MODULE_ALIAS("devname:net/tun");
6 7 7 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 // SPDX-License-Identifier: GPL-2.0-or-later /* * SNAP data link layer. Derived from 802.2 * * Alan Cox <alan@lxorguk.ukuu.org.uk>, * from the 802.2 layer by Greg Page. * Merged in additions from Greg Page's psnap.c. */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/datalink.h> #include <net/llc.h> #include <net/psnap.h> #include <linux/mm.h> #include <linux/in.h> #include <linux/init.h> #include <linux/rculist.h> static LIST_HEAD(snap_list); static DEFINE_SPINLOCK(snap_lock); static struct llc_sap *snap_sap; /* * Find a snap client by matching the 5 bytes. */ static struct datalink_proto *find_snap_client(const unsigned char *desc) { struct datalink_proto *proto = NULL, *p; list_for_each_entry_rcu(p, &snap_list, node, lockdep_is_held(&snap_lock)) { if (!memcmp(p->type, desc, 5)) { proto = p; break; } } return proto; } /* * A SNAP packet has arrived */ static int snap_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { int rc = 1; struct datalink_proto *proto; static struct packet_type snap_packet_type = { .type = cpu_to_be16(ETH_P_SNAP), }; if (unlikely(!pskb_may_pull(skb, 5))) goto drop; rcu_read_lock(); proto = find_snap_client(skb_transport_header(skb)); if (proto) { /* Pass the frame on. */ skb->transport_header += 5; skb_pull_rcsum(skb, 5); rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev); } rcu_read_unlock(); if (unlikely(!proto)) goto drop; out: return rc; drop: kfree_skb(skb); goto out; } /* * Put a SNAP header on a frame and pass to 802.2 */ static int snap_request(struct datalink_proto *dl, struct sk_buff *skb, const u8 *dest) { memcpy(skb_push(skb, 5), dl->type, 5); llc_build_and_send_ui_pkt(snap_sap, skb, dest, snap_sap->laddr.lsap); return 0; } /* * Set up the SNAP layer */ EXPORT_SYMBOL(register_snap_client); EXPORT_SYMBOL(unregister_snap_client); static const char snap_err_msg[] __initconst = KERN_CRIT "SNAP - unable to register with 802.2\n"; static int __init snap_init(void) { snap_sap = llc_sap_open(0xAA, snap_rcv); if (!snap_sap) { printk(snap_err_msg); return -EBUSY; } return 0; } module_init(snap_init); static void __exit snap_exit(void) { llc_sap_put(snap_sap); } module_exit(snap_exit); /* * Register SNAP clients. We don't yet use this for IP. */ struct datalink_proto *register_snap_client(const unsigned char *desc, int (*rcvfunc)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *)) { struct datalink_proto *proto = NULL; spin_lock_bh(&snap_lock); if (find_snap_client(desc)) goto out; proto = kmalloc(sizeof(*proto), GFP_ATOMIC); if (proto) { memcpy(proto->type, desc, 5); proto->rcvfunc = rcvfunc; proto->header_length = 5 + 3; /* snap + 802.2 */ proto->request = snap_request; list_add_rcu(&proto->node, &snap_list); } out: spin_unlock_bh(&snap_lock); return proto; } /* * Unregister SNAP clients. Protocols no longer want to play with us ... */ void unregister_snap_client(struct datalink_proto *proto) { spin_lock_bh(&snap_lock); list_del_rcu(&proto->node); spin_unlock_bh(&snap_lock); synchronize_net(); kfree(proto); } MODULE_DESCRIPTION("SNAP data link layer. Derived from 802.2"); MODULE_LICENSE("GPL");
17 17 17 6 2 2 7 8 7 7 2 6 6 4 6 6 1 9 40 40 1 40 3 37 36 17 19 2 17 17 17 17 15 2 5 5 45 45 45 45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 fragment reassembly * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on: net/ipv4/ip_fragment.c */ /* * Fixes: * Andi Kleen Make it work with multiple hosts. * More RFC compliance. * * Horst von Brand Add missing #include <linux/string.h> * Alexey Kuznetsov SMP races, threading, cleanup. * Patrick McHardy LRU queue of frag heads for evictor. * Mitsuru KANDA @USAGI Register inet6_protocol{}. * David Stevens and * YOSHIFUJI,H. @USAGI Always remove fragment header to * calculate ICV correctly. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/errno.h> #include <linux/types.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/jiffies.h> #include <linux/net.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> #include <linux/jhash.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/tcp.h> #include <linux/udp.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/rawv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/ipv6_frag.h> #include <net/inet_ecn.h> static const char ip6_frag_cache_name[] = "ip6-frags"; static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); } static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev); static void ip6_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; fq = container_of(frag, struct frag_queue, q); ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } static struct frag_queue * fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) { struct frag_v6_compare_key key = { .id = id, .saddr = hdr->saddr, .daddr = hdr->daddr, .user = IP6_DEFRAG_LOCAL_DELIVER, .iif = iif, }; struct inet_frag_queue *q; if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL))) key.iif = 0; q = inet_frag_find(net->ipv6.fqdir, &key); if (!q) return NULL; return container_of(q, struct frag_queue, q); } static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, struct frag_hdr *fhdr, int nhoff, u32 *prob_offset) { struct net *net = dev_net(skb_dst(skb)->dev); int offset, end, fragsize; struct sk_buff *prev_tail; struct net_device *dev; int err = -ENOENT; SKB_DR(reason); u8 ecn; /* If reassembly is already done, @skb must be a duplicate frag. */ if (fq->q.flags & INET_FRAG_COMPLETE) { SKB_DR_SET(reason, DUP_FRAG); goto err; } err = -EINVAL; offset = ntohs(fhdr->frag_off) & ~0x7; end = offset + (ntohs(ipv6_hdr(skb)->payload_len) - ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { *prob_offset = (u8 *)&fhdr->frag_off - skb_network_header(skb); /* note that if prob_offset is set, the skb is freed elsewhere, * we do not free it here. */ return -1; } ecn = ip6_frag_ecn(ipv6_hdr(skb)); if (skb->ip_summed == CHECKSUM_COMPLETE) { const unsigned char *nh = skb_network_header(skb); skb->csum = csum_sub(skb->csum, csum_partial(nh, (u8 *)(fhdr + 1) - nh, 0)); } /* Is this the final fragment? */ if (!(fhdr->frag_off & htons(IP6_MF))) { /* If we already have some bits beyond end * or have different end, the segment is corrupted. */ if (end < fq->q.len || ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) goto discard_fq; fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { /* Check if the fragment is rounded to 8 bytes. * Required by the RFC. */ if (end & 0x7) { /* RFC2460 says always send parameter problem in * this case. -DaveM */ *prob_offset = offsetof(struct ipv6hdr, payload_len); return -1; } if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ if (fq->q.flags & INET_FRAG_LAST_IN) goto discard_fq; fq->q.len = end; } } if (end == offset) goto discard_fq; err = -ENOMEM; /* Point into the IP datagram 'data' part. */ if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) goto discard_fq; err = pskb_trim_rcsum(skb, end - offset); if (err) goto discard_fq; /* Note : skb->rbnode and skb->dev share the same location. */ dev = skb->dev; /* Makes sure compiler wont do silly aliasing games */ barrier(); prev_tail = fq->q.fragments_tail; err = inet_frag_queue_insert(&fq->q, skb, offset, end); if (err) goto insert_error; if (dev) fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; fq->q.tstamp_type = skb->tstamp_type; fq->q.meat += skb->len; fq->ecn |= ecn; add_frag_mem_limit(fq->q.fqdir, skb->truesize); fragsize = -skb_network_offset(skb) + skb->len; if (fragsize > fq->q.max_size) fq->q.max_size = fragsize; /* The first fragment. * nhoffset is obtained from the first fragment, of course. */ if (offset == 0) { fq->nhoffset = nhoff; fq->q.flags |= INET_FRAG_FIRST_IN; } if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; err = ip6_frag_reasm(fq, skb, prev_tail, dev); skb->_skb_refdst = orefdst; return err; } skb_dst_drop(skb); return -EINPROGRESS; insert_error: if (err == IPFRAG_DUP) { SKB_DR_SET(reason, DUP_FRAG); err = -EINVAL; goto err; } err = -EINVAL; __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASM_OVERLAPS); discard_fq: inet_frag_kill(&fq->q); __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); err: kfree_skb_reason(skb, reason); return err; } /* * Check if this packet is complete. * * It is called with locked fq, and caller must check that * queue is eligible for reassembly i.e. it is not COMPLETE, * the last and the first frames arrived and all the bits are here. */ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) { struct net *net = fq->q.fqdir->net; unsigned int nhoff; void *reasm_data; int payload_len; u8 ecn; inet_frag_kill(&fq->q); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) goto out_fail; reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); if (!reasm_data) goto out_oom; payload_len = -skb_network_offset(skb) - sizeof(struct ipv6hdr) + fq->q.len - sizeof(struct frag_hdr); if (payload_len > IPV6_MAXPLEN) goto out_oversize; /* We have to remove fragment header from datagram and to relocate * header in order to calculate ICV correctly. */ nhoff = fq->nhoffset; skb_network_header(skb)[nhoff] = skb_transport_header(skb)[0]; memmove(skb->head + sizeof(struct frag_hdr), skb->head, (skb->data - skb->head) - sizeof(struct frag_hdr)); if (skb_mac_header_was_set(skb)) skb->mac_header += sizeof(struct frag_hdr); skb->network_header += sizeof(struct frag_hdr); skb_reset_transport_header(skb); inet_frag_reasm_finish(&fq->q, skb, reasm_data, true); skb->dev = dev; ipv6_hdr(skb)->payload_len = htons(payload_len); ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn); IP6CB(skb)->nhoff = nhoff; IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; IP6CB(skb)->frag_max_size = fq->q.max_size; /* Yes, and fold redundant checksum back. 8) */ skb_postpush_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMOKS); rcu_read_unlock(); fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; fq->q.last_run_head = NULL; return 1; out_oversize: net_dbg_ratelimited("ip6_frag_reasm: payload len = %d\n", payload_len); goto out_fail; out_oom: net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n"); out_fail: rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMFAILS); rcu_read_unlock(); inet_frag_kill(&fq->q); return -1; } static int ipv6_frag_rcv(struct sk_buff *skb) { struct frag_hdr *fhdr; struct frag_queue *fq; const struct ipv6hdr *hdr = ipv6_hdr(skb); struct net *net = dev_net(skb_dst(skb)->dev); u8 nexthdr; int iif; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) goto fail_hdr; __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); /* Jumbo payload inhibits frag. header */ if (hdr->payload_len == 0) goto fail_hdr; if (!pskb_may_pull(skb, (skb_transport_offset(skb) + sizeof(struct frag_hdr)))) goto fail_hdr; hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); if (!(fhdr->frag_off & htons(IP6_OFFSET | IP6_MF))) { /* It is not a fragmented frame */ skb->transport_header += sizeof(struct frag_hdr); __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS); IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb); IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; IP6CB(skb)->frag_max_size = ntohs(hdr->payload_len) + sizeof(struct ipv6hdr); return 1; } /* RFC 8200, Section 4.5 Fragment Header: * If the first fragment does not include all headers through an * Upper-Layer header, then that fragment should be discarded and * an ICMP Parameter Problem, Code 3, message should be sent to * the source of the fragment, with the Pointer field set to zero. */ nexthdr = hdr->nexthdr; if (ipv6frag_thdr_truncated(skb, skb_network_offset(skb) + sizeof(struct ipv6hdr), &nexthdr)) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_INCOMP, 0); return -1; } iif = skb->dev ? skb->dev->ifindex : 0; fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { u32 prob_offset = 0; int ret; spin_lock(&fq->q.lock); fq->iif = iif; ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff, &prob_offset); spin_unlock(&fq->q.lock); inet_frag_put(&fq->q); if (prob_offset) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); /* icmpv6_param_prob() calls kfree_skb(skb) */ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset); } return ret; } __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -1; fail_hdr: __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb)); return -1; } static const struct inet6_protocol frag_protocol = { .handler = ipv6_frag_rcv, .flags = INET6_PROTO_NOPOLICY, }; #ifdef CONFIG_SYSCTL static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "ip6frag_low_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "ip6frag_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, }; /* secret interval has been deprecated */ static int ip6_frags_secret_interval_unused; static struct ctl_table ip6_frags_ctl_table[] = { { .procname = "ip6frag_secret_interval", .data = &ip6_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, }; static int __net_init ip6_frags_ns_sysctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; table = ip6_frags_ns_ctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(ip6_frags_ns_ctl_table), GFP_KERNEL); if (!table) goto err_alloc; } table[0].data = &net->ipv6.fqdir->high_thresh; table[0].extra1 = &net->ipv6.fqdir->low_thresh; table[1].data = &net->ipv6.fqdir->low_thresh; table[1].extra2 = &net->ipv6.fqdir->high_thresh; table[2].data = &net->ipv6.fqdir->timeout; hdr = register_net_sysctl_sz(net, "net/ipv6", table, ARRAY_SIZE(ip6_frags_ns_ctl_table)); if (!hdr) goto err_reg; net->ipv6.sysctl.frags_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit ip6_frags_ns_sysctl_unregister(struct net *net) { const struct ctl_table *table; table = net->ipv6.sysctl.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv6.sysctl.frags_hdr); if (!net_eq(net, &init_net)) kfree(table); } static struct ctl_table_header *ip6_ctl_header; static int ip6_frags_sysctl_register(void) { ip6_ctl_header = register_net_sysctl(&init_net, "net/ipv6", ip6_frags_ctl_table); return ip6_ctl_header == NULL ? -ENOMEM : 0; } static void ip6_frags_sysctl_unregister(void) { unregister_net_sysctl_table(ip6_ctl_header); } #else static int ip6_frags_ns_sysctl_register(struct net *net) { return 0; } static void ip6_frags_ns_sysctl_unregister(struct net *net) { } static int ip6_frags_sysctl_register(void) { return 0; } static void ip6_frags_sysctl_unregister(void) { } #endif static int __net_init ipv6_frags_init_net(struct net *net) { int res; res = fqdir_init(&net->ipv6.fqdir, &ip6_frags, net); if (res < 0) return res; net->ipv6.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; net->ipv6.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; net->ipv6.fqdir->timeout = IPV6_FRAG_TIMEOUT; res = ip6_frags_ns_sysctl_register(net); if (res < 0) fqdir_exit(net->ipv6.fqdir); return res; } static void __net_exit ipv6_frags_pre_exit_net(struct net *net) { fqdir_pre_exit(net->ipv6.fqdir); } static void __net_exit ipv6_frags_exit_net(struct net *net) { ip6_frags_ns_sysctl_unregister(net); fqdir_exit(net->ipv6.fqdir); } static struct pernet_operations ip6_frags_ops = { .init = ipv6_frags_init_net, .pre_exit = ipv6_frags_pre_exit_net, .exit = ipv6_frags_exit_net, }; static const struct rhashtable_params ip6_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), .hashfn = ip6frag_key_hashfn, .obj_hashfn = ip6frag_obj_hashfn, .obj_cmpfn = ip6frag_obj_cmpfn, .automatic_shrinking = true, }; int __init ipv6_frag_init(void) { int ret; ip6_frags.constructor = ip6frag_init; ip6_frags.destructor = NULL; ip6_frags.qsize = sizeof(struct frag_queue); ip6_frags.frag_expire = ip6_frag_expire; ip6_frags.frags_cache_name = ip6_frag_cache_name; ip6_frags.rhash_params = ip6_rhash_params; ret = inet_frags_init(&ip6_frags); if (ret) goto out; ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT); if (ret) goto err_protocol; ret = ip6_frags_sysctl_register(); if (ret) goto err_sysctl; ret = register_pernet_subsys(&ip6_frags_ops); if (ret) goto err_pernet; out: return ret; err_pernet: ip6_frags_sysctl_unregister(); err_sysctl: inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); err_protocol: inet_frags_fini(&ip6_frags); goto out; } void ipv6_frag_exit(void) { ip6_frags_sysctl_unregister(); unregister_pernet_subsys(&ip6_frags_ops); inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); inet_frags_fini(&ip6_frags); }
11 583 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 /* SPDX-License-Identifier: GPL-2.0 */ /* linux/net/inet/arp.h */ #ifndef _ARP_H #define _ARP_H #include <linux/if_arp.h> #include <linux/hash.h> #include <net/neighbour.h> extern struct neigh_table arp_tbl; static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 *hash_rnd) { u32 key = *(const u32 *)pkey; u32 val = key ^ hash32_ptr(dev); return val * hash_rnd[0]; } #ifdef CONFIG_INET static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) key = INADDR_ANY; return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } #else static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { return NULL; } #endif static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(dev, key); if (n && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock(); return n; } static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(dev, key); neigh_confirm(n); rcu_read_unlock(); } void arp_init(void); int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg); void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *th); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir); void arp_ifdown(struct net_device *dev); int arp_invalidate(struct net_device *dev, __be32 ip, bool force); struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw); void arp_xmit(struct sk_buff *skb); #endif /* _ARP_H */
3 3 3 8 5 5 3 3 3 3 3 3 3 3 3 3 3 3 3 5 5 5 5 5 5 5 5 5 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include <trace/events/devlink.h> #include "devl_internal.h" struct devlink_stats { u64_stats_t rx_bytes; u64_stats_t rx_packets; struct u64_stats_sync syncp; }; /** * struct devlink_trap_policer_item - Packet trap policer attributes. * @policer: Immutable packet trap policer attributes. * @rate: Rate in packets / sec. * @burst: Burst size in packets. * @list: trap_policer_list member. * * Describes packet trap policer attributes. Created by devlink during trap * policer registration. */ struct devlink_trap_policer_item { const struct devlink_trap_policer *policer; u64 rate; u64 burst; struct list_head list; }; /** * struct devlink_trap_group_item - Packet trap group attributes. * @group: Immutable packet trap group attributes. * @policer_item: Associated policer item. Can be NULL. * @list: trap_group_list member. * @stats: Trap group statistics. * * Describes packet trap group attributes. Created by devlink during trap * group registration. */ struct devlink_trap_group_item { const struct devlink_trap_group *group; struct devlink_trap_policer_item *policer_item; struct list_head list; struct devlink_stats __percpu *stats; }; /** * struct devlink_trap_item - Packet trap attributes. * @trap: Immutable packet trap attributes. * @group_item: Associated group item. * @list: trap_list member. * @action: Trap action. * @stats: Trap statistics. * @priv: Driver private information. * * Describes both mutable and immutable packet trap attributes. Created by * devlink during trap registration and used for all trap related operations. */ struct devlink_trap_item { const struct devlink_trap *trap; struct devlink_trap_group_item *group_item; struct list_head list; enum devlink_trap_action action; struct devlink_stats __percpu *stats; void *priv; }; static struct devlink_trap_policer_item * devlink_trap_policer_item_lookup(struct devlink *devlink, u32 id) { struct devlink_trap_policer_item *policer_item; list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { if (policer_item->policer->id == id) return policer_item; } return NULL; } static struct devlink_trap_item * devlink_trap_item_lookup(struct devlink *devlink, const char *name) { struct devlink_trap_item *trap_item; list_for_each_entry(trap_item, &devlink->trap_list, list) { if (!strcmp(trap_item->trap->name, name)) return trap_item; } return NULL; } static struct devlink_trap_item * devlink_trap_item_get_from_info(struct devlink *devlink, struct genl_info *info) { struct nlattr *attr; if (!info->attrs[DEVLINK_ATTR_TRAP_NAME]) return NULL; attr = info->attrs[DEVLINK_ATTR_TRAP_NAME]; return devlink_trap_item_lookup(devlink, nla_data(attr)); } static int devlink_trap_action_get_from_info(struct genl_info *info, enum devlink_trap_action *p_trap_action) { u8 val; val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]); switch (val) { case DEVLINK_TRAP_ACTION_DROP: case DEVLINK_TRAP_ACTION_TRAP: case DEVLINK_TRAP_ACTION_MIRROR: *p_trap_action = val; break; default: return -EINVAL; } return 0; } static int devlink_trap_metadata_put(struct sk_buff *msg, const struct devlink_trap *trap) { struct nlattr *attr; attr = nla_nest_start(msg, DEVLINK_ATTR_TRAP_METADATA); if (!attr) return -EMSGSIZE; if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) && nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT)) goto nla_put_failure; if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE) && nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, struct devlink_stats *stats) { int i; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(i) { struct devlink_stats *cpu_stats; u64 rx_packets, rx_bytes; unsigned int start; cpu_stats = per_cpu_ptr(trap_stats, i); do { start = u64_stats_fetch_begin(&cpu_stats->syncp); rx_packets = u64_stats_read(&cpu_stats->rx_packets); rx_bytes = u64_stats_read(&cpu_stats->rx_bytes); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->rx_packets, rx_packets); u64_stats_add(&stats->rx_bytes, rx_bytes); } } static int devlink_trap_group_stats_put(struct sk_buff *msg, struct devlink_stats __percpu *trap_stats) { struct devlink_stats stats; struct nlattr *attr; devlink_trap_stats_read(trap_stats, &stats); attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); if (!attr) return -EMSGSIZE; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_PACKETS, u64_stats_read(&stats.rx_packets))) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_BYTES, u64_stats_read(&stats.rx_bytes))) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int devlink_trap_stats_put(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_item *trap_item) { struct devlink_stats stats; struct nlattr *attr; u64 drops = 0; int err; if (devlink->ops->trap_drop_counter_get) { err = devlink->ops->trap_drop_counter_get(devlink, trap_item->trap, &drops); if (err) return err; } devlink_trap_stats_read(trap_item->stats, &stats); attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); if (!attr) return -EMSGSIZE; if (devlink->ops->trap_drop_counter_get && devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops)) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_PACKETS, u64_stats_read(&stats.rx_packets))) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_BYTES, u64_stats_read(&stats.rx_bytes))) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_item *trap_item, enum devlink_command cmd, u32 portid, u32 seq, int flags) { struct devlink_trap_group_item *group_item = trap_item->group_item; void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME, group_item->group->name)) goto nla_put_failure; if (nla_put_string(msg, DEVLINK_ATTR_TRAP_NAME, trap_item->trap->name)) goto nla_put_failure; if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_TYPE, trap_item->trap->type)) goto nla_put_failure; if (trap_item->trap->generic && nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) goto nla_put_failure; if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_ACTION, trap_item->action)) goto nla_put_failure; err = devlink_trap_metadata_put(msg, trap_item->trap); if (err) goto nla_put_failure; err = devlink_trap_stats_put(msg, devlink, trap_item); if (err) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } int devlink_nl_trap_get_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_trap_item *trap_item; struct sk_buff *msg; int err; if (list_empty(&devlink->trap_list)) return -EOPNOTSUPP; trap_item = devlink_trap_item_get_from_info(devlink, info); if (!trap_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap"); return -ENOENT; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_trap_fill(msg, devlink, trap_item, DEVLINK_CMD_TRAP_NEW, info->snd_portid, info->snd_seq, 0); if (err) goto err_trap_fill; return genlmsg_reply(msg, info); err_trap_fill: nlmsg_free(msg); return err; } static int devlink_nl_trap_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_item *trap_item; int idx = 0; int err = 0; list_for_each_entry(trap_item, &devlink->trap_list, list) { if (idx < state->idx) { idx++; continue; } err = devlink_nl_trap_fill(msg, devlink, trap_item, DEVLINK_CMD_TRAP_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; } idx++; } return err; } int devlink_nl_trap_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_trap_get_dump_one); } static int __devlink_trap_action_set(struct devlink *devlink, struct devlink_trap_item *trap_item, enum devlink_trap_action trap_action, struct netlink_ext_ack *extack) { int err; if (trap_item->action != trap_action && trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) { NL_SET_ERR_MSG(extack, "Cannot change action of non-drop traps. Skipping"); return 0; } err = devlink->ops->trap_action_set(devlink, trap_item->trap, trap_action, extack); if (err) return err; trap_item->action = trap_action; return 0; } static int devlink_trap_action_set(struct devlink *devlink, struct devlink_trap_item *trap_item, struct genl_info *info) { enum devlink_trap_action trap_action; int err; if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION]) return 0; err = devlink_trap_action_get_from_info(info, &trap_action); if (err) { NL_SET_ERR_MSG(info->extack, "Invalid trap action"); return -EINVAL; } return __devlink_trap_action_set(devlink, trap_item, trap_action, info->extack); } int devlink_nl_trap_set_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_trap_item *trap_item; if (list_empty(&devlink->trap_list)) return -EOPNOTSUPP; trap_item = devlink_trap_item_get_from_info(devlink, info); if (!trap_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap"); return -ENOENT; } return devlink_trap_action_set(devlink, trap_item, info); } static struct devlink_trap_group_item * devlink_trap_group_item_lookup(struct devlink *devlink, const char *name) { struct devlink_trap_group_item *group_item; list_for_each_entry(group_item, &devlink->trap_group_list, list) { if (!strcmp(group_item->group->name, name)) return group_item; } return NULL; } static struct devlink_trap_group_item * devlink_trap_group_item_lookup_by_id(struct devlink *devlink, u16 id) { struct devlink_trap_group_item *group_item; list_for_each_entry(group_item, &devlink->trap_group_list, list) { if (group_item->group->id == id) return group_item; } return NULL; } static struct devlink_trap_group_item * devlink_trap_group_item_get_from_info(struct devlink *devlink, struct genl_info *info) { char *name; if (!info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]) return NULL; name = nla_data(info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]); return devlink_trap_group_item_lookup(devlink, name); } static int devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_group_item *group_item, enum devlink_command cmd, u32 portid, u32 seq, int flags) { void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME, group_item->group->name)) goto nla_put_failure; if (group_item->group->generic && nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) goto nla_put_failure; if (group_item->policer_item && nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID, group_item->policer_item->policer->id)) goto nla_put_failure; err = devlink_trap_group_stats_put(msg, group_item->stats); if (err) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } int devlink_nl_trap_group_get_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_trap_group_item *group_item; struct sk_buff *msg; int err; if (list_empty(&devlink->trap_group_list)) return -EOPNOTSUPP; group_item = devlink_trap_group_item_get_from_info(devlink, info); if (!group_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap group"); return -ENOENT; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_trap_group_fill(msg, devlink, group_item, DEVLINK_CMD_TRAP_GROUP_NEW, info->snd_portid, info->snd_seq, 0); if (err) goto err_trap_group_fill; return genlmsg_reply(msg, info); err_trap_group_fill: nlmsg_free(msg); return err; } static int devlink_nl_trap_group_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_group_item *group_item; int idx = 0; int err = 0; list_for_each_entry(group_item, &devlink->trap_group_list, list) { if (idx < state->idx) { idx++; continue; } err = devlink_nl_trap_group_fill(msg, devlink, group_item, DEVLINK_CMD_TRAP_GROUP_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; } idx++; } return err; } int devlink_nl_trap_group_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_trap_group_get_dump_one); } static int __devlink_trap_group_action_set(struct devlink *devlink, struct devlink_trap_group_item *group_item, enum devlink_trap_action trap_action, struct netlink_ext_ack *extack) { const char *group_name = group_item->group->name; struct devlink_trap_item *trap_item; int err; if (devlink->ops->trap_group_action_set) { err = devlink->ops->trap_group_action_set(devlink, group_item->group, trap_action, extack); if (err) return err; list_for_each_entry(trap_item, &devlink->trap_list, list) { if (strcmp(trap_item->group_item->group->name, group_name)) continue; if (trap_item->action != trap_action && trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) continue; trap_item->action = trap_action; } return 0; } list_for_each_entry(trap_item, &devlink->trap_list, list) { if (strcmp(trap_item->group_item->group->name, group_name)) continue; err = __devlink_trap_action_set(devlink, trap_item, trap_action, extack); if (err) return err; } return 0; } static int devlink_trap_group_action_set(struct devlink *devlink, struct devlink_trap_group_item *group_item, struct genl_info *info, bool *p_modified) { enum devlink_trap_action trap_action; int err; if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION]) return 0; err = devlink_trap_action_get_from_info(info, &trap_action); if (err) { NL_SET_ERR_MSG(info->extack, "Invalid trap action"); return -EINVAL; } err = __devlink_trap_group_action_set(devlink, group_item, trap_action, info->extack); if (err) return err; *p_modified = true; return 0; } static int devlink_trap_group_set(struct devlink *devlink, struct devlink_trap_group_item *group_item, struct genl_info *info) { struct devlink_trap_policer_item *policer_item; struct netlink_ext_ack *extack = info->extack; const struct devlink_trap_policer *policer; struct nlattr **attrs = info->attrs; u32 policer_id; int err; if (!attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) return 0; if (!devlink->ops->trap_group_set) return -EOPNOTSUPP; policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); policer_item = devlink_trap_policer_item_lookup(devlink, policer_id); if (policer_id && !policer_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); return -ENOENT; } policer = policer_item ? policer_item->policer : NULL; err = devlink->ops->trap_group_set(devlink, group_item->group, policer, extack); if (err) return err; group_item->policer_item = policer_item; return 0; } int devlink_nl_trap_group_set_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_trap_group_item *group_item; bool modified = false; int err; if (list_empty(&devlink->trap_group_list)) return -EOPNOTSUPP; group_item = devlink_trap_group_item_get_from_info(devlink, info); if (!group_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap group"); return -ENOENT; } err = devlink_trap_group_action_set(devlink, group_item, info, &modified); if (err) return err; err = devlink_trap_group_set(devlink, group_item, info); if (err) goto err_trap_group_set; return 0; err_trap_group_set: if (modified) NL_SET_ERR_MSG(extack, "Trap group set failed, but some changes were committed already"); return err; } static struct devlink_trap_policer_item * devlink_trap_policer_item_get_from_info(struct devlink *devlink, struct genl_info *info) { u32 id; if (!info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) return NULL; id = nla_get_u32(info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); return devlink_trap_policer_item_lookup(devlink, id); } static int devlink_trap_policer_stats_put(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_policer *policer) { struct nlattr *attr; u64 drops; int err; if (!devlink->ops->trap_policer_counter_get) return 0; err = devlink->ops->trap_policer_counter_get(devlink, policer, &drops); if (err) return err; attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); if (!attr) return -EMSGSIZE; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int devlink_nl_trap_policer_fill(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_policer_item *policer_item, enum devlink_command cmd, u32 portid, u32 seq, int flags) { void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID, policer_item->policer->id)) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_TRAP_POLICER_RATE, policer_item->rate)) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_TRAP_POLICER_BURST, policer_item->burst)) goto nla_put_failure; err = devlink_trap_policer_stats_put(msg, devlink, policer_item->policer); if (err) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } int devlink_nl_trap_policer_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_trap_policer_item *policer_item; struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct sk_buff *msg; int err; if (list_empty(&devlink->trap_policer_list)) return -EOPNOTSUPP; policer_item = devlink_trap_policer_item_get_from_info(devlink, info); if (!policer_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); return -ENOENT; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_NEW, info->snd_portid, info->snd_seq, 0); if (err) goto err_trap_policer_fill; return genlmsg_reply(msg, info); err_trap_policer_fill: nlmsg_free(msg); return err; } static int devlink_nl_trap_policer_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_policer_item *policer_item; int idx = 0; int err = 0; list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { if (idx < state->idx) { idx++; continue; } err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; } idx++; } return err; } int devlink_nl_trap_policer_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_trap_policer_get_dump_one); } static int devlink_trap_policer_set(struct devlink *devlink, struct devlink_trap_policer_item *policer_item, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct nlattr **attrs = info->attrs; u64 rate, burst; int err; rate = policer_item->rate; burst = policer_item->burst; if (attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]) rate = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]); if (attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]) burst = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]); if (rate < policer_item->policer->min_rate) { NL_SET_ERR_MSG(extack, "Policer rate lower than limit"); return -EINVAL; } if (rate > policer_item->policer->max_rate) { NL_SET_ERR_MSG(extack, "Policer rate higher than limit"); return -EINVAL; } if (burst < policer_item->policer->min_burst) { NL_SET_ERR_MSG(extack, "Policer burst size lower than limit"); return -EINVAL; } if (burst > policer_item->policer->max_burst) { NL_SET_ERR_MSG(extack, "Policer burst size higher than limit"); return -EINVAL; } err = devlink->ops->trap_policer_set(devlink, policer_item->policer, rate, burst, info->extack); if (err) return err; policer_item->rate = rate; policer_item->burst = burst; return 0; } int devlink_nl_trap_policer_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_trap_policer_item *policer_item; struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; if (list_empty(&devlink->trap_policer_list)) return -EOPNOTSUPP; if (!devlink->ops->trap_policer_set) return -EOPNOTSUPP; policer_item = devlink_trap_policer_item_get_from_info(devlink, info); if (!policer_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); return -ENOENT; } return devlink_trap_policer_set(devlink, policer_item, info); } #define DEVLINK_TRAP(_id, _type) \ { \ .type = DEVLINK_TRAP_TYPE_##_type, \ .id = DEVLINK_TRAP_GENERIC_ID_##_id, \ .name = DEVLINK_TRAP_GENERIC_NAME_##_id, \ } static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(SMAC_MC, DROP), DEVLINK_TRAP(VLAN_TAG_MISMATCH, DROP), DEVLINK_TRAP(INGRESS_VLAN_FILTER, DROP), DEVLINK_TRAP(INGRESS_STP_FILTER, DROP), DEVLINK_TRAP(EMPTY_TX_LIST, DROP), DEVLINK_TRAP(PORT_LOOPBACK_FILTER, DROP), DEVLINK_TRAP(BLACKHOLE_ROUTE, DROP), DEVLINK_TRAP(TTL_ERROR, EXCEPTION), DEVLINK_TRAP(TAIL_DROP, DROP), DEVLINK_TRAP(NON_IP_PACKET, DROP), DEVLINK_TRAP(UC_DIP_MC_DMAC, DROP), DEVLINK_TRAP(DIP_LB, DROP), DEVLINK_TRAP(SIP_MC, DROP), DEVLINK_TRAP(SIP_LB, DROP), DEVLINK_TRAP(CORRUPTED_IP_HDR, DROP), DEVLINK_TRAP(IPV4_SIP_BC, DROP), DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP), DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP), DEVLINK_TRAP(MTU_ERROR, EXCEPTION), DEVLINK_TRAP(UNRESOLVED_NEIGH, EXCEPTION), DEVLINK_TRAP(RPF, EXCEPTION), DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION), DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION), DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION), DEVLINK_TRAP(NON_ROUTABLE, DROP), DEVLINK_TRAP(DECAP_ERROR, EXCEPTION), DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP), DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP), DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP), DEVLINK_TRAP(STP, CONTROL), DEVLINK_TRAP(LACP, CONTROL), DEVLINK_TRAP(LLDP, CONTROL), DEVLINK_TRAP(IGMP_QUERY, CONTROL), DEVLINK_TRAP(IGMP_V1_REPORT, CONTROL), DEVLINK_TRAP(IGMP_V2_REPORT, CONTROL), DEVLINK_TRAP(IGMP_V3_REPORT, CONTROL), DEVLINK_TRAP(IGMP_V2_LEAVE, CONTROL), DEVLINK_TRAP(MLD_QUERY, CONTROL), DEVLINK_TRAP(MLD_V1_REPORT, CONTROL), DEVLINK_TRAP(MLD_V2_REPORT, CONTROL), DEVLINK_TRAP(MLD_V1_DONE, CONTROL), DEVLINK_TRAP(IPV4_DHCP, CONTROL), DEVLINK_TRAP(IPV6_DHCP, CONTROL), DEVLINK_TRAP(ARP_REQUEST, CONTROL), DEVLINK_TRAP(ARP_RESPONSE, CONTROL), DEVLINK_TRAP(ARP_OVERLAY, CONTROL), DEVLINK_TRAP(IPV6_NEIGH_SOLICIT, CONTROL), DEVLINK_TRAP(IPV6_NEIGH_ADVERT, CONTROL), DEVLINK_TRAP(IPV4_BFD, CONTROL), DEVLINK_TRAP(IPV6_BFD, CONTROL), DEVLINK_TRAP(IPV4_OSPF, CONTROL), DEVLINK_TRAP(IPV6_OSPF, CONTROL), DEVLINK_TRAP(IPV4_BGP, CONTROL), DEVLINK_TRAP(IPV6_BGP, CONTROL), DEVLINK_TRAP(IPV4_VRRP, CONTROL), DEVLINK_TRAP(IPV6_VRRP, CONTROL), DEVLINK_TRAP(IPV4_PIM, CONTROL), DEVLINK_TRAP(IPV6_PIM, CONTROL), DEVLINK_TRAP(UC_LB, CONTROL), DEVLINK_TRAP(LOCAL_ROUTE, CONTROL), DEVLINK_TRAP(EXTERNAL_ROUTE, CONTROL), DEVLINK_TRAP(IPV6_UC_DIP_LINK_LOCAL_SCOPE, CONTROL), DEVLINK_TRAP(IPV6_DIP_ALL_NODES, CONTROL), DEVLINK_TRAP(IPV6_DIP_ALL_ROUTERS, CONTROL), DEVLINK_TRAP(IPV6_ROUTER_SOLICIT, CONTROL), DEVLINK_TRAP(IPV6_ROUTER_ADVERT, CONTROL), DEVLINK_TRAP(IPV6_REDIRECT, CONTROL), DEVLINK_TRAP(IPV4_ROUTER_ALERT, CONTROL), DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL), DEVLINK_TRAP(PTP_EVENT, CONTROL), DEVLINK_TRAP(PTP_GENERAL, CONTROL), DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL), DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL), DEVLINK_TRAP(EARLY_DROP, DROP), DEVLINK_TRAP(VXLAN_PARSING, DROP), DEVLINK_TRAP(LLC_SNAP_PARSING, DROP), DEVLINK_TRAP(VLAN_PARSING, DROP), DEVLINK_TRAP(PPPOE_PPP_PARSING, DROP), DEVLINK_TRAP(MPLS_PARSING, DROP), DEVLINK_TRAP(ARP_PARSING, DROP), DEVLINK_TRAP(IP_1_PARSING, DROP), DEVLINK_TRAP(IP_N_PARSING, DROP), DEVLINK_TRAP(GRE_PARSING, DROP), DEVLINK_TRAP(UDP_PARSING, DROP), DEVLINK_TRAP(TCP_PARSING, DROP), DEVLINK_TRAP(IPSEC_PARSING, DROP), DEVLINK_TRAP(SCTP_PARSING, DROP), DEVLINK_TRAP(DCCP_PARSING, DROP), DEVLINK_TRAP(GTP_PARSING, DROP), DEVLINK_TRAP(ESP_PARSING, DROP), DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP), DEVLINK_TRAP(DMAC_FILTER, DROP), DEVLINK_TRAP(EAPOL, CONTROL), DEVLINK_TRAP(LOCKED_PORT, DROP), }; #define DEVLINK_TRAP_GROUP(_id) \ { \ .id = DEVLINK_TRAP_GROUP_GENERIC_ID_##_id, \ .name = DEVLINK_TRAP_GROUP_GENERIC_NAME_##_id, \ } static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(L2_DROPS), DEVLINK_TRAP_GROUP(L3_DROPS), DEVLINK_TRAP_GROUP(L3_EXCEPTIONS), DEVLINK_TRAP_GROUP(BUFFER_DROPS), DEVLINK_TRAP_GROUP(TUNNEL_DROPS), DEVLINK_TRAP_GROUP(ACL_DROPS), DEVLINK_TRAP_GROUP(STP), DEVLINK_TRAP_GROUP(LACP), DEVLINK_TRAP_GROUP(LLDP), DEVLINK_TRAP_GROUP(MC_SNOOPING), DEVLINK_TRAP_GROUP(DHCP), DEVLINK_TRAP_GROUP(NEIGH_DISCOVERY), DEVLINK_TRAP_GROUP(BFD), DEVLINK_TRAP_GROUP(OSPF), DEVLINK_TRAP_GROUP(BGP), DEVLINK_TRAP_GROUP(VRRP), DEVLINK_TRAP_GROUP(PIM), DEVLINK_TRAP_GROUP(UC_LB), DEVLINK_TRAP_GROUP(LOCAL_DELIVERY), DEVLINK_TRAP_GROUP(EXTERNAL_DELIVERY), DEVLINK_TRAP_GROUP(IPV6), DEVLINK_TRAP_GROUP(PTP_EVENT), DEVLINK_TRAP_GROUP(PTP_GENERAL), DEVLINK_TRAP_GROUP(ACL_SAMPLE), DEVLINK_TRAP_GROUP(ACL_TRAP), DEVLINK_TRAP_GROUP(PARSER_ERROR_DROPS), DEVLINK_TRAP_GROUP(EAPOL), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) { if (trap->id > DEVLINK_TRAP_GENERIC_ID_MAX) return -EINVAL; if (strcmp(trap->name, devlink_trap_generic[trap->id].name)) return -EINVAL; if (trap->type != devlink_trap_generic[trap->id].type) return -EINVAL; return 0; } static int devlink_trap_driver_verify(const struct devlink_trap *trap) { int i; if (trap->id <= DEVLINK_TRAP_GENERIC_ID_MAX) return -EINVAL; for (i = 0; i < ARRAY_SIZE(devlink_trap_generic); i++) { if (!strcmp(trap->name, devlink_trap_generic[i].name)) return -EEXIST; } return 0; } static int devlink_trap_verify(const struct devlink_trap *trap) { if (!trap || !trap->name) return -EINVAL; if (trap->generic) return devlink_trap_generic_verify(trap); else return devlink_trap_driver_verify(trap); } static int devlink_trap_group_generic_verify(const struct devlink_trap_group *group) { if (group->id > DEVLINK_TRAP_GROUP_GENERIC_ID_MAX) return -EINVAL; if (strcmp(group->name, devlink_trap_group_generic[group->id].name)) return -EINVAL; return 0; } static int devlink_trap_group_driver_verify(const struct devlink_trap_group *group) { int i; if (group->id <= DEVLINK_TRAP_GROUP_GENERIC_ID_MAX) return -EINVAL; for (i = 0; i < ARRAY_SIZE(devlink_trap_group_generic); i++) { if (!strcmp(group->name, devlink_trap_group_generic[i].name)) return -EEXIST; } return 0; } static int devlink_trap_group_verify(const struct devlink_trap_group *group) { if (group->generic) return devlink_trap_group_generic_verify(group); else return devlink_trap_group_driver_verify(group); } static void devlink_trap_group_notify(struct devlink *devlink, const struct devlink_trap_group_item *group_item, enum devlink_command cmd) { struct sk_buff *msg; int err; WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_GROUP_NEW && cmd != DEVLINK_CMD_TRAP_GROUP_DEL); if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_trap_group_fill(msg, devlink, group_item, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; } devlink_nl_notify_send(devlink, msg); } void devlink_trap_groups_notify_register(struct devlink *devlink) { struct devlink_trap_group_item *group_item; list_for_each_entry(group_item, &devlink->trap_group_list, list) devlink_trap_group_notify(devlink, group_item, DEVLINK_CMD_TRAP_GROUP_NEW); } void devlink_trap_groups_notify_unregister(struct devlink *devlink) { struct devlink_trap_group_item *group_item; list_for_each_entry_reverse(group_item, &devlink->trap_group_list, list) devlink_trap_group_notify(devlink, group_item, DEVLINK_CMD_TRAP_GROUP_DEL); } static int devlink_trap_item_group_link(struct devlink *devlink, struct devlink_trap_item *trap_item) { u16 group_id = trap_item->trap->init_group_id; struct devlink_trap_group_item *group_item; group_item = devlink_trap_group_item_lookup_by_id(devlink, group_id); if (WARN_ON_ONCE(!group_item)) return -EINVAL; trap_item->group_item = group_item; return 0; } static void devlink_trap_notify(struct devlink *devlink, const struct devlink_trap_item *trap_item, enum devlink_command cmd) { struct sk_buff *msg; int err; WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_NEW && cmd != DEVLINK_CMD_TRAP_DEL); if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_trap_fill(msg, devlink, trap_item, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; } devlink_nl_notify_send(devlink, msg); } void devlink_traps_notify_register(struct devlink *devlink) { struct devlink_trap_item *trap_item; list_for_each_entry(trap_item, &devlink->trap_list, list) devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); } void devlink_traps_notify_unregister(struct devlink *devlink) { struct devlink_trap_item *trap_item; list_for_each_entry_reverse(trap_item, &devlink->trap_list, list) devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); } static int devlink_trap_register(struct devlink *devlink, const struct devlink_trap *trap, void *priv) { struct devlink_trap_item *trap_item; int err; if (devlink_trap_item_lookup(devlink, trap->name)) return -EEXIST; trap_item = kzalloc(sizeof(*trap_item), GFP_KERNEL); if (!trap_item) return -ENOMEM; trap_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); if (!trap_item->stats) { err = -ENOMEM; goto err_stats_alloc; } trap_item->trap = trap; trap_item->action = trap->init_action; trap_item->priv = priv; err = devlink_trap_item_group_link(devlink, trap_item); if (err) goto err_group_link; err = devlink->ops->trap_init(devlink, trap, trap_item); if (err) goto err_trap_init; list_add_tail(&trap_item->list, &devlink->trap_list); devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); return 0; err_trap_init: err_group_link: free_percpu(trap_item->stats); err_stats_alloc: kfree(trap_item); return err; } static void devlink_trap_unregister(struct devlink *devlink, const struct devlink_trap *trap) { struct devlink_trap_item *trap_item; trap_item = devlink_trap_item_lookup(devlink, trap->name); if (WARN_ON_ONCE(!trap_item)) return; devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); list_del(&trap_item->list); if (devlink->ops->trap_fini) devlink->ops->trap_fini(devlink, trap, trap_item); free_percpu(trap_item->stats); kfree(trap_item); } static void devlink_trap_disable(struct devlink *devlink, const struct devlink_trap *trap) { struct devlink_trap_item *trap_item; trap_item = devlink_trap_item_lookup(devlink, trap->name); if (WARN_ON_ONCE(!trap_item)) return; devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP, NULL); trap_item->action = DEVLINK_TRAP_ACTION_DROP; } /** * devl_traps_register - Register packet traps with devlink. * @devlink: devlink. * @traps: Packet traps. * @traps_count: Count of provided packet traps. * @priv: Driver private information. * * Return: Non-zero value on failure. */ int devl_traps_register(struct devlink *devlink, const struct devlink_trap *traps, size_t traps_count, void *priv) { int i, err; if (!devlink->ops->trap_init || !devlink->ops->trap_action_set) return -EINVAL; devl_assert_locked(devlink); for (i = 0; i < traps_count; i++) { const struct devlink_trap *trap = &traps[i]; err = devlink_trap_verify(trap); if (err) goto err_trap_verify; err = devlink_trap_register(devlink, trap, priv); if (err) goto err_trap_register; } return 0; err_trap_register: err_trap_verify: for (i--; i >= 0; i--) devlink_trap_unregister(devlink, &traps[i]); return err; } EXPORT_SYMBOL_GPL(devl_traps_register); /** * devlink_traps_register - Register packet traps with devlink. * @devlink: devlink. * @traps: Packet traps. * @traps_count: Count of provided packet traps. * @priv: Driver private information. * * Context: Takes and release devlink->lock <mutex>. * * Return: Non-zero value on failure. */ int devlink_traps_register(struct devlink *devlink, const struct devlink_trap *traps, size_t traps_count, void *priv) { int err; devl_lock(devlink); err = devl_traps_register(devlink, traps, traps_count, priv); devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_traps_register); /** * devl_traps_unregister - Unregister packet traps from devlink. * @devlink: devlink. * @traps: Packet traps. * @traps_count: Count of provided packet traps. */ void devl_traps_unregister(struct devlink *devlink, const struct devlink_trap *traps, size_t traps_count) { int i; devl_assert_locked(devlink); /* Make sure we do not have any packets in-flight while unregistering * traps by disabling all of them and waiting for a grace period. */ for (i = traps_count - 1; i >= 0; i--) devlink_trap_disable(devlink, &traps[i]); synchronize_rcu(); for (i = traps_count - 1; i >= 0; i--) devlink_trap_unregister(devlink, &traps[i]); } EXPORT_SYMBOL_GPL(devl_traps_unregister); /** * devlink_traps_unregister - Unregister packet traps from devlink. * @devlink: devlink. * @traps: Packet traps. * @traps_count: Count of provided packet traps. * * Context: Takes and release devlink->lock <mutex>. */ void devlink_traps_unregister(struct devlink *devlink, const struct devlink_trap *traps, size_t traps_count) { devl_lock(devlink); devl_traps_unregister(devlink, traps, traps_count); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_traps_unregister); static void devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats, size_t skb_len) { struct devlink_stats *stats; stats = this_cpu_ptr(trap_stats); u64_stats_update_begin(&stats->syncp); u64_stats_add(&stats->rx_bytes, skb_len); u64_stats_inc(&stats->rx_packets); u64_stats_update_end(&stats->syncp); } static void devlink_trap_report_metadata_set(struct devlink_trap_metadata *metadata, const struct devlink_trap_item *trap_item, struct devlink_port *in_devlink_port, const struct flow_action_cookie *fa_cookie) { metadata->trap_name = trap_item->trap->name; metadata->trap_group_name = trap_item->group_item->group->name; metadata->fa_cookie = fa_cookie; metadata->trap_type = trap_item->trap->type; spin_lock(&in_devlink_port->type_lock); if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH) metadata->input_dev = in_devlink_port->type_eth.netdev; spin_unlock(&in_devlink_port->type_lock); } /** * devlink_trap_report - Report trapped packet to drop monitor. * @devlink: devlink. * @skb: Trapped packet. * @trap_ctx: Trap context. * @in_devlink_port: Input devlink port. * @fa_cookie: Flow action cookie. Could be NULL. */ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, void *trap_ctx, struct devlink_port *in_devlink_port, const struct flow_action_cookie *fa_cookie) { struct devlink_trap_item *trap_item = trap_ctx; devlink_trap_stats_update(trap_item->stats, skb->len); devlink_trap_stats_update(trap_item->group_item->stats, skb->len); if (tracepoint_enabled(devlink_trap_report)) { struct devlink_trap_metadata metadata = {}; devlink_trap_report_metadata_set(&metadata, trap_item, in_devlink_port, fa_cookie); trace_devlink_trap_report(devlink, skb, &metadata); } } EXPORT_SYMBOL_GPL(devlink_trap_report); /** * devlink_trap_ctx_priv - Trap context to driver private information. * @trap_ctx: Trap context. * * Return: Driver private information passed during registration. */ void *devlink_trap_ctx_priv(void *trap_ctx) { struct devlink_trap_item *trap_item = trap_ctx; return trap_item->priv; } EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv); static int devlink_trap_group_item_policer_link(struct devlink *devlink, struct devlink_trap_group_item *group_item) { u32 policer_id = group_item->group->init_policer_id; struct devlink_trap_policer_item *policer_item; if (policer_id == 0) return 0; policer_item = devlink_trap_policer_item_lookup(devlink, policer_id); if (WARN_ON_ONCE(!policer_item)) return -EINVAL; group_item->policer_item = policer_item; return 0; } static int devlink_trap_group_register(struct devlink *devlink, const struct devlink_trap_group *group) { struct devlink_trap_group_item *group_item; int err; if (devlink_trap_group_item_lookup(devlink, group->name)) return -EEXIST; group_item = kzalloc(sizeof(*group_item), GFP_KERNEL); if (!group_item) return -ENOMEM; group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); if (!group_item->stats) { err = -ENOMEM; goto err_stats_alloc; } group_item->group = group; err = devlink_trap_group_item_policer_link(devlink, group_item); if (err) goto err_policer_link; if (devlink->ops->trap_group_init) { err = devlink->ops->trap_group_init(devlink, group); if (err) goto err_group_init; } list_add_tail(&group_item->list, &devlink->trap_group_list); devlink_trap_group_notify(devlink, group_item, DEVLINK_CMD_TRAP_GROUP_NEW); return 0; err_group_init: err_policer_link: free_percpu(group_item->stats); err_stats_alloc: kfree(group_item); return err; } static void devlink_trap_group_unregister(struct devlink *devlink, const struct devlink_trap_group *group) { struct devlink_trap_group_item *group_item; group_item = devlink_trap_group_item_lookup(devlink, group->name); if (WARN_ON_ONCE(!group_item)) return; devlink_trap_group_notify(devlink, group_item, DEVLINK_CMD_TRAP_GROUP_DEL); list_del(&group_item->list); free_percpu(group_item->stats); kfree(group_item); } /** * devl_trap_groups_register - Register packet trap groups with devlink. * @devlink: devlink. * @groups: Packet trap groups. * @groups_count: Count of provided packet trap groups. * * Return: Non-zero value on failure. */ int devl_trap_groups_register(struct devlink *devlink, const struct devlink_trap_group *groups, size_t groups_count) { int i, err; devl_assert_locked(devlink); for (i = 0; i < groups_count; i++) { const struct devlink_trap_group *group = &groups[i]; err = devlink_trap_group_verify(group); if (err) goto err_trap_group_verify; err = devlink_trap_group_register(devlink, group); if (err) goto err_trap_group_register; } return 0; err_trap_group_register: err_trap_group_verify: for (i--; i >= 0; i--) devlink_trap_group_unregister(devlink, &groups[i]); return err; } EXPORT_SYMBOL_GPL(devl_trap_groups_register); /** * devlink_trap_groups_register - Register packet trap groups with devlink. * @devlink: devlink. * @groups: Packet trap groups. * @groups_count: Count of provided packet trap groups. * * Context: Takes and release devlink->lock <mutex>. * * Return: Non-zero value on failure. */ int devlink_trap_groups_register(struct devlink *devlink, const struct devlink_trap_group *groups, size_t groups_count) { int err; devl_lock(devlink); err = devl_trap_groups_register(devlink, groups, groups_count); devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_trap_groups_register); /** * devl_trap_groups_unregister - Unregister packet trap groups from devlink. * @devlink: devlink. * @groups: Packet trap groups. * @groups_count: Count of provided packet trap groups. */ void devl_trap_groups_unregister(struct devlink *devlink, const struct devlink_trap_group *groups, size_t groups_count) { int i; devl_assert_locked(devlink); for (i = groups_count - 1; i >= 0; i--) devlink_trap_group_unregister(devlink, &groups[i]); } EXPORT_SYMBOL_GPL(devl_trap_groups_unregister); /** * devlink_trap_groups_unregister - Unregister packet trap groups from devlink. * @devlink: devlink. * @groups: Packet trap groups. * @groups_count: Count of provided packet trap groups. * * Context: Takes and release devlink->lock <mutex>. */ void devlink_trap_groups_unregister(struct devlink *devlink, const struct devlink_trap_group *groups, size_t groups_count) { devl_lock(devlink); devl_trap_groups_unregister(devlink, groups, groups_count); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_trap_groups_unregister); static void devlink_trap_policer_notify(struct devlink *devlink, const struct devlink_trap_policer_item *policer_item, enum devlink_command cmd) { struct sk_buff *msg; int err; WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_POLICER_NEW && cmd != DEVLINK_CMD_TRAP_POLICER_DEL); if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; } devlink_nl_notify_send(devlink, msg); } void devlink_trap_policers_notify_register(struct devlink *devlink) { struct devlink_trap_policer_item *policer_item; list_for_each_entry(policer_item, &devlink->trap_policer_list, list) devlink_trap_policer_notify(devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_NEW); } void devlink_trap_policers_notify_unregister(struct devlink *devlink) { struct devlink_trap_policer_item *policer_item; list_for_each_entry_reverse(policer_item, &devlink->trap_policer_list, list) devlink_trap_policer_notify(devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_DEL); } static int devlink_trap_policer_register(struct devlink *devlink, const struct devlink_trap_policer *policer) { struct devlink_trap_policer_item *policer_item; int err; if (devlink_trap_policer_item_lookup(devlink, policer->id)) return -EEXIST; policer_item = kzalloc(sizeof(*policer_item), GFP_KERNEL); if (!policer_item) return -ENOMEM; policer_item->policer = policer; policer_item->rate = policer->init_rate; policer_item->burst = policer->init_burst; if (devlink->ops->trap_policer_init) { err = devlink->ops->trap_policer_init(devlink, policer); if (err) goto err_policer_init; } list_add_tail(&policer_item->list, &devlink->trap_policer_list); devlink_trap_policer_notify(devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_NEW); return 0; err_policer_init: kfree(policer_item); return err; } static void devlink_trap_policer_unregister(struct devlink *devlink, const struct devlink_trap_policer *policer) { struct devlink_trap_policer_item *policer_item; policer_item = devlink_trap_policer_item_lookup(devlink, policer->id); if (WARN_ON_ONCE(!policer_item)) return; devlink_trap_policer_notify(devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_DEL); list_del(&policer_item->list); if (devlink->ops->trap_policer_fini) devlink->ops->trap_policer_fini(devlink, policer); kfree(policer_item); } /** * devl_trap_policers_register - Register packet trap policers with devlink. * @devlink: devlink. * @policers: Packet trap policers. * @policers_count: Count of provided packet trap policers. * * Return: Non-zero value on failure. */ int devl_trap_policers_register(struct devlink *devlink, const struct devlink_trap_policer *policers, size_t policers_count) { int i, err; devl_assert_locked(devlink); for (i = 0; i < policers_count; i++) { const struct devlink_trap_policer *policer = &policers[i]; if (WARN_ON(policer->id == 0 || policer->max_rate < policer->min_rate || policer->max_burst < policer->min_burst)) { err = -EINVAL; goto err_trap_policer_verify; } err = devlink_trap_policer_register(devlink, policer); if (err) goto err_trap_policer_register; } return 0; err_trap_policer_register: err_trap_policer_verify: for (i--; i >= 0; i--) devlink_trap_policer_unregister(devlink, &policers[i]); return err; } EXPORT_SYMBOL_GPL(devl_trap_policers_register); /** * devl_trap_policers_unregister - Unregister packet trap policers from devlink. * @devlink: devlink. * @policers: Packet trap policers. * @policers_count: Count of provided packet trap policers. */ void devl_trap_policers_unregister(struct devlink *devlink, const struct devlink_trap_policer *policers, size_t policers_count) { int i; devl_assert_locked(devlink); for (i = policers_count - 1; i >= 0; i--) devlink_trap_policer_unregister(devlink, &policers[i]); } EXPORT_SYMBOL_GPL(devl_trap_policers_unregister);
35 1161 267 268 1 1 7 7 284 284 166 166 167 167 9 9 9 7316 7319 28 28 28 28 28 28 28 28 28 1558 1560 1553 1558 1560 1564 1564 1559 142 1553 1556 1558 1558 1480 1484 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 // SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. * Copyright (C) 2009 Red Hat, Inc. * * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ #include <uapi/linux/sched/types.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/numa.h> #include <linux/sched/isolation.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ char *full_name; int (*threadfn)(void *data); void *data; int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; struct completion *done; struct list_head list; }; struct kthread { unsigned long flags; unsigned int cpu; int result; int (*threadfn)(void *); void *data; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif /* To store the full name if task comm is truncated. */ char *full_name; }; enum KTHREAD_BITS { KTHREAD_IS_PER_CPU = 0, KTHREAD_SHOULD_STOP, KTHREAD_SHOULD_PARK, }; static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); return k->worker_private; } /* * Variant of to_kthread() that doesn't assume @p is a kthread. * * Per construction; when: * * (p->flags & PF_KTHREAD) && p->worker_private * * the task is both a kthread and struct kthread is persistent. However * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and * begin_new_exec()). */ static inline struct kthread *__to_kthread(struct task_struct *p) { void *kthread = p->worker_private; if (kthread && !(p->flags & PF_KTHREAD)) kthread = NULL; return kthread; } void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { struct kthread *kthread = to_kthread(tsk); if (!kthread || !kthread->full_name) { strscpy(buf, tsk->comm, buf_size); return; } strscpy_pad(buf, kthread->full_name, buf_size); } bool set_kthread_struct(struct task_struct *p) { struct kthread *kthread; if (WARN_ON_ONCE(to_kthread(p))) return false; kthread = kzalloc(sizeof(*kthread), GFP_KERNEL); if (!kthread) return false; init_completion(&kthread->exited); init_completion(&kthread->parked); p->vfork_done = &kthread->exited; p->worker_private = kthread; return true; } void free_kthread_struct(struct task_struct *k) { struct kthread *kthread; /* * Can be NULL if kmalloc() in set_kthread_struct() failed. */ kthread = to_kthread(k); if (!kthread) return; #ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread->blkcg_css); #endif k->worker_private = NULL; kfree(kthread->full_name); kfree(kthread); } /** * kthread_should_stop - should this kthread return now? * * When someone calls kthread_stop() on your kthread, it will be woken * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ bool kthread_should_stop(void) { return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); static bool __kthread_should_park(struct task_struct *k) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); } /** * kthread_should_park - should this kthread park now? * * When someone calls kthread_park() on your kthread, it will be woken * and this will return true. You should then do the necessary * cleanup and call kthread_parkme() * * Similar to kthread_should_stop(), but this keeps the thread alive * and in a park position. kthread_unpark() "restarts" the thread and * calls the thread function again. */ bool kthread_should_park(void) { return __kthread_should_park(current); } EXPORT_SYMBOL_GPL(kthread_should_park); bool kthread_should_stop_or_park(void) { struct kthread *kthread = __to_kthread(current); if (!kthread) return false; return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); } /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * * kthread_should_stop() for freezable kthreads, which will enter * refrigerator if necessary. This function is safe from kthread_stop() / * freezer deadlock and freezable kthreads should use this function instead * of calling try_to_freeze() directly. */ bool kthread_freezable_should_stop(bool *was_frozen) { bool frozen = false; might_sleep(); if (unlikely(freezing(current))) frozen = __refrigerator(true); if (was_frozen) *was_frozen = frozen; return kthread_should_stop(); } EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); /** * kthread_func - return the function specified on kthread creation * @task: kthread task in question * * Returns NULL if the task is not a kthread. */ void *kthread_func(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); if (kthread) return kthread->threadfn; return NULL; } EXPORT_SYMBOL_GPL(kthread_func); /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * * Return the data value specified when kthread @task was created. * The caller is responsible for ensuring the validity of @task when * calling this function. */ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() * @task: possible kthread task in question * * @task could be a kthread task. Return the data value specified when it * was created if accessible. If @task isn't a kthread task or its data is * inaccessible for any reason, %NULL is returned. This function requires * that @task itself is safe to dereference. */ void *kthread_probe_data(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); void *data = NULL; if (kthread) copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); return data; } static void __kthread_parkme(struct kthread *self) { for (;;) { /* * TASK_PARKED is a special state; we must serialize against * possible pending wakeups to avoid store-store collisions on * task->state. * * Such a collision might possibly result in the task state * changin from TASK_PARKED and us failing the * wait_task_inactive() in kthread_park(). */ set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; /* * Thread is going to call schedule(), do not preempt it, * or the caller of kthread_park() may spend more time in * wait_task_inactive(). */ preempt_disable(); complete(&self->parked); schedule_preempt_disabled(); preempt_enable(); } __set_current_state(TASK_RUNNING); } void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } EXPORT_SYMBOL_GPL(kthread_parkme); /** * kthread_exit - Cause the current kthread return @result to kthread_stop(). * @result: The integer value to return to kthread_stop(). * * While kthread_exit can be called directly, it exists so that * functions which do some additional work in non-modular code such as * module_put_and_kthread_exit can be implemented. * * Does not return. */ void __noreturn kthread_exit(long result) { struct kthread *kthread = to_kthread(current); kthread->result = result; do_exit(0); } EXPORT_SYMBOL(kthread_exit); /** * kthread_complete_and_exit - Exit the current kthread. * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * * If present, complete @comp and then return code to kthread_stop(). * * A kernel thread whose module may be removed after the completion of * @comp can use this function to exit safely. * * Does not return. */ void __noreturn kthread_complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); kthread_exit(code); } EXPORT_SYMBOL(kthread_complete_and_exit); static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; struct kthread *self; int ret; self = to_kthread(current); /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { kfree(create->full_name); kfree(create); kthread_exit(-EINTR); } self->full_name = create->full_name; self->threadfn = threadfn; self->data = data; /* * The new thread inherited kthreadd's priority and CPU mask. Reset * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param); set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD)); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; /* * Thread is going to call schedule(), do not preempt it, * or the creator may spend more time in wait_task_inactive(). */ preempt_disable(); complete(done); schedule_preempt_disabled(); preempt_enable(); ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } kthread_exit(ret); } /* called from kernel_clone() to get node information for about to be created task */ int tsk_fork_get_node(struct task_struct *tsk) { #ifdef CONFIG_NUMA if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) { int pid; #ifdef CONFIG_NUMA current->pref_node_fork = create->node; #endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, create->full_name, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { /* Release the structure when caller killed by a fatal signal. */ struct completion *done = xchg(&create->done, NULL); kfree(create->full_name); if (!done) { kfree(create); return; } create->result = ERR_PTR(pid); complete(done); } } static __printf(4, 0) struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) { DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc(sizeof(*create), GFP_KERNEL); if (!create) return ERR_PTR(-ENOMEM); create->threadfn = threadfn; create->data = data; create->node = node; create->done = &done; create->full_name = kvasprintf(GFP_KERNEL, namefmt, args); if (!create->full_name) { task = ERR_PTR(-ENOMEM); goto free_create; } spin_lock(&kthread_create_lock); list_add_tail(&create->list, &kthread_create_list); spin_unlock(&kthread_create_lock); wake_up_process(kthreadd_task); /* * Wait for completion in killable state, for I might be chosen by * the OOM killer while kthreadd is trying to allocate memory for * new kernel thread. */ if (unlikely(wait_for_completion_killable(&done))) { /* * If I was killed by a fatal signal before kthreadd (or new * kernel thread) calls complete(), leave the cleanup of this * structure to that thread. */ if (xchg(&create->done, NULL)) return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. */ wait_for_completion(&done); } task = create->result; free_create: kfree(create); return task; } /** * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either return directly if it is a * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], ...) { struct task_struct *task; va_list args; va_start(args, namefmt); task = __kthread_create_on_node(threadfn, data, node, namefmt, args); va_end(args); return task; } EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { unsigned long flags; if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } /* It's safe because the task is inactive. */ raw_spin_lock_irqsave(&p->pi_lock, flags); do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; raw_spin_unlock_irqrestore(&p->pi_lock, flags); } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) { __kthread_bind_mask(p, cpumask_of(cpu), state); } void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); } /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). * @cpu: cpu (might not be online, must be possible) for @k to run on. * * Description: This function is equivalent to set_cpus_allowed(), * except that @cpu doesn't need to be online, and the thread must be * stopped (i.e., just returned from kthread_create()). */ void kthread_bind(struct task_struct *p, unsigned int cpu) { __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(kthread_bind); /** * kthread_create_on_cpu - Create a cpu bound kthread * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @cpu: The cpu on which the thread should be bound, * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Description: This helper function creates and names a kernel thread */ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, const char *namefmt) { struct task_struct *p; p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu); if (IS_ERR(p)) return p; kthread_bind(p, cpu); /* CPU hotplug need to bind once again when unparking the thread. */ to_kthread(p)->cpu = cpu; return p; } EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { struct kthread *kthread = to_kthread(k); if (!kthread) return; WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY)); if (cpu < 0) { clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags); return; } kthread->cpu = cpu; set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } bool kthread_is_per_cpu(struct task_struct *p) { struct kthread *kthread = __to_kthread(p); if (!kthread) return false; return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return false, wakes it, and * waits for it to return. If the thread is marked percpu then its * bound to the cpu again. */ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)) return; /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. */ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return true, wakes it, and * waits for it to return. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will park without * calling threadfn(). * * Returns 0 if the thread is parked, -ENOSYS if the thread exited. * If called by the kthread itself just the park bit is set. */ int kthread_park(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) return -EBUSY; set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); /* * Wait for __kthread_parkme() to complete(), this means we * _will_ have TASK_PARKED and are about to call schedule(). */ wait_for_completion(&kthread->parked); /* * Now wait for that schedule() to complete and the task to * get scheduled out. */ WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; } EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and * waits for it to exit. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will exit without * calling threadfn(). * * If threadfn() may call kthread_exit() itself, the caller must ensure * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. */ int kthread_stop(struct task_struct *k) { struct kthread *kthread; int ret; trace_sched_kthread_stop(k); get_task_struct(k); kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); kthread_unpark(k); set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL); wake_up_process(k); wait_for_completion(&kthread->exited); ret = kthread->result; put_task_struct(k); trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); /** * kthread_stop_put - stop a thread and put its task struct * @k: thread created by kthread_create(). * * Stops a thread created by kthread_create() and put its task_struct. * Only use when holding an extra task struct reference obtained by * calling get_task_struct(). */ int kthread_stop_put(struct task_struct *k) { int ret; ret = kthread_stop(k); put_task_struct(k); return ret; } EXPORT_SYMBOL(kthread_stop_put); int kthreadd(void *unused) { struct task_struct *tsk = current; /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (list_empty(&kthread_create_list)) schedule(); __set_current_state(TASK_RUNNING); spin_lock(&kthread_create_lock); while (!list_empty(&kthread_create_list)) { struct kthread_create_info *create; create = list_entry(kthread_create_list.next, struct kthread_create_info, list); list_del_init(&create->list); spin_unlock(&kthread_create_lock); create_kthread(create); spin_lock(&kthread_create_lock); } spin_unlock(&kthread_create_lock); } return 0; } void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); } EXPORT_SYMBOL_GPL(__kthread_init_worker); /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker * * This function implements the main cycle of kthread worker. It processes * work_list until it is stopped with kthread_stop(). It sleeps when the queue * is empty. * * The works are not allowed to keep any locks, disable preemption or interrupts * when they finish. There is defined a safe point for freezing when one work * finishes and before a new one is started. * * Also the works must not be handled by more than one worker at the same time, * see also kthread_queue_work(). */ int kthread_worker_fn(void *worker_ptr) { struct kthread_worker *worker = worker_ptr; struct kthread_work *work; /* * FIXME: Update the check and remove the assignment when all kthread * worker users are created using kthread_create_worker*() functions. */ WARN_ON(worker->task && worker->task != current); worker->task = current; if (worker->flags & KTW_FREEZABLE) set_freezable(); repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&worker->lock); worker->task = NULL; raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; raw_spin_unlock_irq(&worker->lock); if (work) { kthread_work_func_t func = work->func; __set_current_state(TASK_RUNNING); trace_sched_kthread_work_execute_start(work); work->func(work); /* * Avoid dereferencing work after this point. The trace * event only cares about the address. */ trace_sched_kthread_work_execute_end(work, func); } else if (!freezing(current)) { schedule(); } else { /* * Handle the case where the current remains * TASK_INTERRUPTIBLE. try_to_freeze() expects * the current to be TASK_RUNNING. */ __set_current_state(TASK_RUNNING); } try_to_freeze(); cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * __kthread_create_worker(int cpu, unsigned int flags, const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; int node = NUMA_NO_NODE; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) return ERR_PTR(-ENOMEM); kthread_init_worker(worker); if (cpu >= 0) node = cpu_to_node(cpu); task = __kthread_create_on_node(kthread_worker_fn, worker, node, namefmt, args); if (IS_ERR(task)) goto fail_task; if (cpu >= 0) kthread_bind(task, cpu); worker->flags = flags; worker->task = task; wake_up_process(task); return worker; fail_task: kfree(worker); return ERR_CAST(task); } /** * kthread_create_worker - create a kthread worker * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker(unsigned int flags, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker(-1, flags, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. * * A good practice is to add the cpu number also into the worker name. * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). * * CPU hotplug: * The kthread worker API is simple and generic. It just provides a way * to create, use, and destroy workers. * * It is up to the API user how to handle CPU hotplug. They have to decide * how to handle pending work items, prevent queuing new ones, and * restore the functionality when the CPU goes off and on. There are a * few catches: * * - CPU affinity gets lost when it is scheduled on an offline CPU. * * - The worker might not exist when the CPU was off when the user * created the workers. * * Good practice is to implement two CPU hotplug callbacks and to * destroy/create the worker when the CPU goes down/up. * * Return: * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker(cpu, flags, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_cpu); /* * Returns true when the work could not be queued at the moment. * It happens when it is already pending in a worker list * or when it is being cancelled. */ static inline bool queuing_blocked(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); return !list_empty(&work->node) || work->canceling; } static void kthread_insert_work_sanity_check(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); WARN_ON_ONCE(!list_empty(&work->node)); /* Do not use a work with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker && work->worker != worker); } /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, struct list_head *pos) { kthread_insert_work_sanity_check(worker, work); trace_sched_kthread_work_queue_work(worker, work); list_add_tail(&work->node, pos); work->worker = worker; if (!worker->current_work && likely(worker->task)) wake_up_process(worker->task); } /** * kthread_queue_work - queue a kthread_work * @worker: target kthread_worker * @work: kthread_work to queue * * Queue @work to work processor @task for async execution. @task * must have been created with kthread_worker_create(). Returns %true * if @work was successfully queued, %false if it was already pending. * * Reinitialize the work if it needs to be used by another worker. * For example, when the worker was stopped and started again. */ bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) { bool ret = false; unsigned long flags; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It should have been called from irqsafe timer with irq already off. */ void kthread_delayed_work_timer_fn(struct timer_list *t) { struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; unsigned long flags; /* * This might happen when a pending work is reinitialized. * It means that it is used a wrong way. */ if (WARN_ON_ONCE(!worker)) return; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); if (!work->canceling) kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ if (!delay) { kthread_insert_work(worker, work, &worker->work_list); return; } /* Be paranoid and try to detect possible races already now. */ kthread_insert_work_sanity_check(worker, work); list_add(&work->node, &worker->delayed_work_list); work->worker = worker; timer->expires = jiffies + delay; add_timer(timer); } /** * kthread_queue_delayed_work - queue the associated kthread work * after a delay. * @worker: target kthread_worker * @dwork: kthread_delayed_work to queue * @delay: number of jiffies to wait before queuing * * If the work has not been pending it starts a timer that will queue * the work after the given @delay. If @delay is zero, it queues the * work immediately. * * Return: %false if the @work has already been pending. It means that * either the timer was running or the work was queued. It returns %true * otherwise. */ bool kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); struct kthread_flush_work { struct kthread_work work; struct completion done; }; static void kthread_flush_work_fn(struct kthread_work *work) { struct kthread_flush_work *fwork = container_of(work, struct kthread_flush_work, work); complete(&fwork->done); } /** * kthread_flush_work - flush a kthread_work * @work: work to flush * * If @work is queued or executing, wait for it to finish execution. */ void kthread_flush_work(struct kthread_work *work) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; struct kthread_worker *worker; bool noop = false; worker = work->worker; if (!worker) return; raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (!list_empty(&work->node)) kthread_insert_work(worker, &fwork.work, work->node.next); else if (worker->current_work == work) kthread_insert_work(worker, &fwork.work, worker->work_list.next); else noop = true; raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_work); /* * Make sure that the timer is neither set nor running and could * not manipulate the work list_head any longer. * * The function is called under worker->lock. The lock is temporary * released but the timer can't be set again in the meantime. */ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, unsigned long *flags) { struct kthread_delayed_work *dwork = container_of(work, struct kthread_delayed_work, work); struct kthread_worker *worker = work->worker; /* * del_timer_sync() must be called to make sure that the timer * callback is not running. The lock must be temporary released * to avoid a deadlock with the callback. In the meantime, * any queuing is blocked by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, *flags); del_timer_sync(&dwork->timer); raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } /* * This function removes the work from the worker queue. * * It is called under worker->lock. The caller must make sure that * the timer used by delayed work is not running, e.g. by calling * kthread_cancel_delayed_work_timer(). * * The work might still be in use when this function finishes. See the * current_work proceed by the worker. * * Return: %true if @work was pending and successfully canceled, * %false if @work was not pending */ static bool __kthread_cancel_work(struct kthread_work *work) { /* * Try to remove the work from a worker list. It might either * be from worker->work_list or from worker->delayed_work_list. */ if (!list_empty(&work->node)) { list_del_init(&work->node); return true; } return false; } /** * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work * @worker: kthread worker to use * @dwork: kthread delayed work to queue * @delay: number of jiffies to wait before queuing * * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is zero, * @work is guaranteed to be queued immediately. * * Return: %false if @dwork was idle and queued, %true otherwise. * * A special case is when the work is being canceled in parallel. * It might be caused either by the real kthread_cancel_delayed_work_sync() * or yet another kthread_mod_delayed_work() call. We let the other command * win and return %true here. The return value can be used for reference * counting and the number of queued works stays the same. Anyway, the caller * is supposed to synchronize these operations a reasonable way. * * This function is safe to call from any context including IRQ handler. * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() * for details. */ bool kthread_mod_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; int ret; raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. */ if (!work->worker) { ret = false; goto fast_queue; } /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); /* * Temporary cancel the work but do not fight with another command * that is canceling the work as well. * * It is a bit tricky because of possible races with another * mod_delayed_work() and cancel_delayed_work() callers. * * The timer must be canceled first because worker->lock is released * when doing so. But the work can be removed from the queue (list) * only when it can be queued again so that the return value can * be used for reference counting. */ kthread_cancel_delayed_work_timer(work, &flags); if (work->canceling) { /* The number of works in the queue does not change. */ ret = true; goto out; } ret = __kthread_cancel_work(work); fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) { struct kthread_worker *worker = work->worker; unsigned long flags; int ret = false; if (!worker) goto out; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (is_dwork) kthread_cancel_delayed_work_timer(work, &flags); ret = __kthread_cancel_work(work); if (worker->current_work != work) goto out_fast; /* * The work is in progress and we need to wait with the lock released. * In the meantime, block any queuing by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } /** * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish * @work: the kthread work to cancel * * Cancel @work and wait for its execution to finish. This function * can be used even if the work re-queues itself. On return from this * function, @work is guaranteed to be not pending or executing on any CPU. * * kthread_cancel_work_sync(&delayed_work->work) must not be used for * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. * * The caller must ensure that the worker on which @work was last * queued can't be destroyed before this function returns. * * Return: %true if @work was pending, %false otherwise. */ bool kthread_cancel_work_sync(struct kthread_work *work) { return __kthread_cancel_work_sync(work, false); } EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); /** * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and * wait for it to finish. * @dwork: the kthread delayed work to cancel * * This is kthread_cancel_work_sync() for delayed works. * * Return: %true if @dwork was pending, %false otherwise. */ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) { return __kthread_cancel_work_sync(&dwork->work, true); } EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); /** * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush * * Wait until all currently executing or pending works on @worker are * finished. */ void kthread_flush_worker(struct kthread_worker *worker) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; kthread_queue_work(worker, &fwork.work); wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_worker); /** * kthread_destroy_worker - destroy a kthread worker * @worker: worker to be destroyed * * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. * * Note that this function is not responsible for handling delayed work, so * caller should be responsible for queuing or canceling all delayed work items * before invoke this function. */ void kthread_destroy_worker(struct kthread_worker *worker) { struct task_struct *task; task = worker->task; if (WARN_ON(!task)) return; kthread_flush_worker(worker); kthread_stop(task); WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); /** * kthread_use_mm - make the calling kthread operate on an address space * @mm: address space to operate on */ void kthread_use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); /* * It is possible for mm to be the same as tsk->active_mm, but * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), * because these references are not equivalent. */ mmgrab(mm); task_lock(tsk); /* Hold off tlb flush IPIs while switching mm's */ local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); #endif /* * When a kthread starts operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after storing to tsk->mm, before accessing * user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by * mmdrop_lazy_tlb(). */ mmdrop_lazy_tlb(active_mm); } EXPORT_SYMBOL_GPL(kthread_use_mm); /** * kthread_unuse_mm - reverse the effect of kthread_use_mm() * @mm: address space to operate on */ void kthread_unuse_mm(struct mm_struct *mm) { struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); task_lock(tsk); /* * When a kthread stops operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after accessing user-space memory, before * clearing tsk->mm. */ smp_mb__after_spinlock(); local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info * * Current thread must be a kthread. The thread is running jobs on behalf of * other threads. In some cases, we expect the jobs attach cgroup info of * original threads instead of that of current thread. This function stores * original thread's cgroup info in current kthread context for later * retrieval. */ void kthread_associate_blkcg(struct cgroup_subsys_state *css) { struct kthread *kthread; if (!(current->flags & PF_KTHREAD)) return; kthread = to_kthread(current); if (!kthread) return; if (kthread->blkcg_css) { css_put(kthread->blkcg_css); kthread->blkcg_css = NULL; } if (css) { css_get(css); kthread->blkcg_css = css; } } EXPORT_SYMBOL(kthread_associate_blkcg); /** * kthread_blkcg - get associated blkcg css of current kthread * * Current thread must be a kthread. */ struct cgroup_subsys_state *kthread_blkcg(void) { struct kthread *kthread; if (current->flags & PF_KTHREAD) { kthread = to_kthread(current); if (kthread) return kthread->blkcg_css; } return NULL; } #endif
23 23 252 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_BRIDGE_NETFILTER_H #define __LINUX_BRIDGE_NETFILTER_H #include <uapi/linux/netfilter_bridge.h> #include <linux/skbuff.h> struct nf_bridge_frag_data { char mac[ETH_HLEN]; bool vlan_present; u16 vlan_tci; __be16 vlan_proto; }; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); static inline void br_drop_fake_rtable(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); if (dst && (dst->flags & DST_FAKE_RTABLE)) skb_dst_drop(skb); } static inline struct nf_bridge_info * nf_bridge_info_get(const struct sk_buff *skb) { return skb_ext_find(skb, SKB_EXT_BRIDGE_NF); } static inline bool nf_bridge_info_exists(const struct sk_buff *skb) { return skb_ext_exist(skb, SKB_EXT_BRIDGE_NF); } static inline int nf_bridge_get_physinif(const struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (!nf_bridge) return 0; return nf_bridge->physinif; } static inline int nf_bridge_get_physoutif(const struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (!nf_bridge) return 0; return nf_bridge->physoutdev ? nf_bridge->physoutdev->ifindex : 0; } static inline struct net_device * nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); return nf_bridge ? dev_get_by_index_rcu(net, nf_bridge->physinif) : NULL; } static inline struct net_device * nf_bridge_get_physoutdev(const struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); return nf_bridge ? nf_bridge->physoutdev : NULL; } static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); return nf_bridge && nf_bridge->in_prerouting; } #else #define br_drop_fake_rtable(skb) do { } while (0) static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb) { return false; } #endif /* CONFIG_BRIDGE_NETFILTER */ #endif
82 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/dccp/timer.c * * An implementation of the DCCP protocol * Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/dccp.h> #include <linux/skbuff.h> #include <linux/export.h> #include "dccp.h" /* sysctl variables governing numbers of retransmission attempts */ int sysctl_dccp_request_retries __read_mostly = TCP_SYN_RETRIES; int sysctl_dccp_retries1 __read_mostly = TCP_RETR1; int sysctl_dccp_retries2 __read_mostly = TCP_RETR2; static void dccp_write_err(struct sock *sk) { sk->sk_err = READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT; sk_error_report(sk); dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); dccp_done(sk); __DCCP_INC_STATS(DCCP_MIB_ABORTONTIMEOUT); } /* A write timeout has occurred. Process the after effects. */ static int dccp_write_timeout(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); int retry_until; if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { if (icsk->icsk_retransmits != 0) dst_negative_advice(sk); retry_until = icsk->icsk_syn_retries ? : sysctl_dccp_request_retries; } else { if (icsk->icsk_retransmits >= sysctl_dccp_retries1) { /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black hole detection. :-( It is place to make it. It is not made. I do not want to make it. It is disguisting. It does not work in any case. Let me to cite the same draft, which requires for us to implement this: "The one security concern raised by this memo is that ICMP black holes are often caused by over-zealous security administrators who block all ICMP messages. It is vitally important that those who design and deploy security systems understand the impact of strict filtering on upper-layer protocols. The safest web site in the world is worthless if most TCP implementations cannot transfer data from it. It would be far nicer to have all of the black holes fixed rather than fixing all of the TCP implementations." Golden words :-). */ dst_negative_advice(sk); } retry_until = sysctl_dccp_retries2; /* * FIXME: see tcp_write_timout and tcp_out_of_resources */ } if (icsk->icsk_retransmits >= retry_until) { /* Has it gone just too far? */ dccp_write_err(sk); return 1; } return 0; } /* * The DCCP retransmit timer. */ static void dccp_retransmit_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); /* * More than 4MSL (8 minutes) has passed, a RESET(aborted) was * sent, no need to retransmit, this sock is dead. */ if (dccp_write_timeout(sk)) return; /* * We want to know the number of packets retransmitted, not the * total number of retransmissions of clones of original packets. */ if (icsk->icsk_retransmits == 0) __DCCP_INC_STATS(DCCP_MIB_TIMEOUTS); if (dccp_retransmit_skb(sk) != 0) { /* * Retransmission failed because of local congestion, * do not backoff. */ if (--icsk->icsk_retransmits == 0) icsk->icsk_retransmits = 1; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), DCCP_RTO_MAX); return; } icsk->icsk_backoff++; icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, DCCP_RTO_MAX); if (icsk->icsk_retransmits > sysctl_dccp_retries1) __sk_dst_reset(sk); } static void dccp_write_timer(struct timer_list *t) { struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_retransmit_timer); struct sock *sk = &icsk->icsk_inet.sk; int event = 0; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later */ sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); goto out; } if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending) goto out; if (time_after(icsk->icsk_timeout, jiffies)) { sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); goto out; } event = icsk->icsk_pending; icsk->icsk_pending = 0; switch (event) { case ICSK_TIME_RETRANS: dccp_retransmit_timer(sk); break; } out: bh_unlock_sock(sk); sock_put(sk); } static void dccp_keepalive_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); pr_err("dccp should not use a keepalive timer !\n"); sock_put(sk); } /* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */ static void dccp_delack_timer(struct timer_list *t) { struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_delack_timer); struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); goto out; } if (sk->sk_state == DCCP_CLOSED || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) goto out; if (time_after(icsk->icsk_ack.timeout, jiffies)) { sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); goto out; } icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; if (inet_csk_ack_scheduled(sk)) { if (!inet_csk_in_pingpong_mode(sk)) { /* Delayed ACK missed: inflate ATO. */ icsk->icsk_ack.ato = min_t(u32, icsk->icsk_ack.ato << 1, icsk->icsk_rto); } else { /* Delayed ACK missed: leave pingpong mode and * deflate ATO. */ inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } dccp_send_ack(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } out: bh_unlock_sock(sk); sock_put(sk); } /** * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface * @t: pointer to the tasklet associated with this handler * * See the comments above %ccid_dequeueing_decision for supported modes. */ static void dccp_write_xmitlet(struct tasklet_struct *t) { struct dccp_sock *dp = from_tasklet(dp, t, dccps_xmitlet); struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk; bh_lock_sock(sk); if (sock_owned_by_user(sk)) sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); else dccp_write_xmit(sk); bh_unlock_sock(sk); sock_put(sk); } static void dccp_write_xmit_timer(struct timer_list *t) { struct dccp_sock *dp = from_timer(dp, t, dccps_xmit_timer); dccp_write_xmitlet(&dp->dccps_xmitlet); } void dccp_init_xmit_timers(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); tasklet_setup(&dp->dccps_xmitlet, dccp_write_xmitlet); timer_setup(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 0); inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, &dccp_keepalive_timer); } static ktime_t dccp_timestamp_seed; /** * dccp_timestamp - 10s of microseconds time source * Returns the number of 10s of microseconds since loading DCCP. This is native * DCCP time difference format (RFC 4340, sec. 13). * Please note: This will wrap around about circa every 11.9 hours. */ u32 dccp_timestamp(void) { u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); do_div(delta, 10); return delta; } EXPORT_SYMBOL_GPL(dccp_timestamp); void __init dccp_timestamping_init(void) { dccp_timestamp_seed = ktime_get_real(); }
10 10 4 4 4 8 4376 121 7 125 45 87 64 434 4230 2119 2189 2175 434 20 7 33 1 4 8 10 68 4139 68 4140 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * VLAN An implementation of 802.1Q VLAN tagging. * * Authors: Ben Greear <greearb@candelatech.com> */ #ifndef _LINUX_IF_VLAN_H_ #define _LINUX_IF_VLAN_H_ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/bug.h> #include <uapi/linux/if_vlan.h> #define VLAN_HLEN 4 /* The additional bytes required by VLAN * (in addition to the Ethernet header) */ #define VLAN_ETH_HLEN 18 /* Total octets in header. */ #define VLAN_ETH_ZLEN 64 /* Min. octets in frame sans FCS */ /* * According to 802.3ac, the packet can be 4 bytes longer. --Klika Jan */ #define VLAN_ETH_DATA_LEN 1500 /* Max. octets in payload */ #define VLAN_ETH_FRAME_LEN 1518 /* Max. octets in frame sans FCS */ #define VLAN_MAX_DEPTH 8 /* Max. number of nested VLAN tags parsed */ /* * struct vlan_hdr - vlan header * @h_vlan_TCI: priority and VLAN ID * @h_vlan_encapsulated_proto: packet type ID or len */ struct vlan_hdr { __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; }; /** * struct vlan_ethhdr - vlan ethernet header (ethhdr + vlan_hdr) * @h_dest: destination ethernet address * @h_source: source ethernet address * @h_vlan_proto: ethernet protocol * @h_vlan_TCI: priority and VLAN ID * @h_vlan_encapsulated_proto: packet type ID or len */ struct vlan_ethhdr { struct_group(addrs, unsigned char h_dest[ETH_ALEN]; unsigned char h_source[ETH_ALEN]; ); __be16 h_vlan_proto; __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; }; #include <linux/skbuff.h> static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb) { return (struct vlan_ethhdr *)skb_mac_header(skb); } /* Prefer this version in TX path, instead of * skb_reset_mac_header() + vlan_eth_hdr() */ static inline struct vlan_ethhdr *skb_vlan_eth_hdr(const struct sk_buff *skb) { return (struct vlan_ethhdr *)skb->data; } #define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */ #define VLAN_PRIO_SHIFT 13 #define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator / Drop Eligible Indicator */ #define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ #define VLAN_N_VID 4096 /* found in socket.c */ extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *)); static inline bool is_vlan_dev(const struct net_device *dev) { return dev->priv_flags & IFF_802_1Q_VLAN; } #define skb_vlan_tag_present(__skb) (!!(__skb)->vlan_all) #define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci) #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) #define skb_vlan_tag_get_cfi(__skb) (!!((__skb)->vlan_tci & VLAN_CFI_MASK)) #define skb_vlan_tag_get_prio(__skb) (((__skb)->vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT) static inline int vlan_get_rx_ctag_filter_info(struct net_device *dev) { ASSERT_RTNL(); return notifier_to_errno(call_netdevice_notifiers(NETDEV_CVLAN_FILTER_PUSH_INFO, dev)); } static inline void vlan_drop_rx_ctag_filter_info(struct net_device *dev) { ASSERT_RTNL(); call_netdevice_notifiers(NETDEV_CVLAN_FILTER_DROP_INFO, dev); } static inline int vlan_get_rx_stag_filter_info(struct net_device *dev) { ASSERT_RTNL(); return notifier_to_errno(call_netdevice_notifiers(NETDEV_SVLAN_FILTER_PUSH_INFO, dev)); } static inline void vlan_drop_rx_stag_filter_info(struct net_device *dev) { ASSERT_RTNL(); call_netdevice_notifiers(NETDEV_SVLAN_FILTER_DROP_INFO, dev); } /** * struct vlan_pcpu_stats - VLAN percpu rx/tx stats * @rx_packets: number of received packets * @rx_bytes: number of received bytes * @rx_multicast: number of received multicast packets * @tx_packets: number of transmitted packets * @tx_bytes: number of transmitted bytes * @syncp: synchronization point for 64bit counters * @rx_errors: number of rx errors * @tx_dropped: number of tx drops */ struct vlan_pcpu_stats { u64_stats_t rx_packets; u64_stats_t rx_bytes; u64_stats_t rx_multicast; u64_stats_t tx_packets; u64_stats_t tx_bytes; struct u64_stats_sync syncp; u32 rx_errors; u32 tx_dropped; }; #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) extern struct net_device *__vlan_find_dev_deep_rcu(struct net_device *real_dev, __be16 vlan_proto, u16 vlan_id); extern int vlan_for_each(struct net_device *dev, int (*action)(struct net_device *dev, int vid, void *arg), void *arg); extern struct net_device *vlan_dev_real_dev(const struct net_device *dev); extern u16 vlan_dev_vlan_id(const struct net_device *dev); extern __be16 vlan_dev_vlan_proto(const struct net_device *dev); /** * struct vlan_priority_tci_mapping - vlan egress priority mappings * @priority: skb priority * @vlan_qos: vlan priority: (skb->priority << 13) & 0xE000 * @next: pointer to next struct */ struct vlan_priority_tci_mapping { u32 priority; u16 vlan_qos; struct vlan_priority_tci_mapping *next; }; struct proc_dir_entry; struct netpoll; /** * struct vlan_dev_priv - VLAN private device data * @nr_ingress_mappings: number of ingress priority mappings * @ingress_priority_map: ingress priority mappings * @nr_egress_mappings: number of egress priority mappings * @egress_priority_map: hash of egress priority mappings * @vlan_proto: VLAN encapsulation protocol * @vlan_id: VLAN identifier * @flags: device flags * @real_dev: underlying netdevice * @dev_tracker: refcount tracker for @real_dev reference * @real_dev_addr: address of underlying netdevice * @dent: proc dir entry * @vlan_pcpu_stats: ptr to percpu rx stats */ struct vlan_dev_priv { unsigned int nr_ingress_mappings; u32 ingress_priority_map[8]; unsigned int nr_egress_mappings; struct vlan_priority_tci_mapping *egress_priority_map[16]; __be16 vlan_proto; u16 vlan_id; u16 flags; struct net_device *real_dev; netdevice_tracker dev_tracker; unsigned char real_dev_addr[ETH_ALEN]; struct proc_dir_entry *dent; struct vlan_pcpu_stats __percpu *vlan_pcpu_stats; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; #endif }; static inline struct vlan_dev_priv *vlan_dev_priv(const struct net_device *dev) { return netdev_priv(dev); } static inline u16 vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio) { struct vlan_priority_tci_mapping *mp; smp_rmb(); /* coupled with smp_wmb() in vlan_dev_set_egress_priority() */ mp = vlan_dev_priv(dev)->egress_priority_map[(skprio & 0xF)]; while (mp) { if (mp->priority == skprio) { return mp->vlan_qos; /* This should already be shifted * to mask correctly with the * VLAN's TCI */ } mp = mp->next; } return 0; } extern bool vlan_do_receive(struct sk_buff **skb); extern int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid); extern void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid); extern int vlan_vids_add_by_dev(struct net_device *dev, const struct net_device *by_dev); extern void vlan_vids_del_by_dev(struct net_device *dev, const struct net_device *by_dev); extern bool vlan_uses_dev(const struct net_device *dev); #else static inline struct net_device * __vlan_find_dev_deep_rcu(struct net_device *real_dev, __be16 vlan_proto, u16 vlan_id) { return NULL; } static inline int vlan_for_each(struct net_device *dev, int (*action)(struct net_device *dev, int vid, void *arg), void *arg) { return 0; } static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev) { BUG(); return NULL; } static inline u16 vlan_dev_vlan_id(const struct net_device *dev) { BUG(); return 0; } static inline __be16 vlan_dev_vlan_proto(const struct net_device *dev) { BUG(); return 0; } static inline u16 vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio) { return 0; } static inline bool vlan_do_receive(struct sk_buff **skb) { return false; } static inline int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid) { return 0; } static inline void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid) { } static inline int vlan_vids_add_by_dev(struct net_device *dev, const struct net_device *by_dev) { return 0; } static inline void vlan_vids_del_by_dev(struct net_device *dev, const struct net_device *by_dev) { } static inline bool vlan_uses_dev(const struct net_device *dev) { return false; } #endif /** * eth_type_vlan - check for valid vlan ether type. * @ethertype: ether type to check * * Returns true if the ether type is a vlan ether type. */ static inline bool eth_type_vlan(__be16 ethertype) { switch (ethertype) { case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): return true; default: return false; } } static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { if (proto == htons(ETH_P_8021Q) && features & NETIF_F_HW_VLAN_CTAG_TX) return true; if (proto == htons(ETH_P_8021AD) && features & NETIF_F_HW_VLAN_STAG_TX) return true; return false; } /** * __vlan_insert_inner_tag - inner VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * @mac_len: MAC header length including outer vlan headers * * Inserts the VLAN tag into @skb as part of the payload at offset mac_len * Returns error if skb_cow_head fails. * * Does not change skb->protocol so this function can be used during receive. */ static inline int __vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci, unsigned int mac_len) { struct vlan_ethhdr *veth; if (skb_cow_head(skb, VLAN_HLEN) < 0) return -ENOMEM; skb_push(skb, VLAN_HLEN); /* Move the mac header sans proto to the beginning of the new header. */ if (likely(mac_len > ETH_TLEN)) memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN); if (skb_mac_header_was_set(skb)) skb->mac_header -= VLAN_HLEN; veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN); /* first, the ethernet type */ if (likely(mac_len >= ETH_TLEN)) { /* h_vlan_encapsulated_proto should already be populated, and * skb->data has space for h_vlan_proto */ veth->h_vlan_proto = vlan_proto; } else { /* h_vlan_encapsulated_proto should not be populated, and * skb->data has no space for h_vlan_proto */ veth->h_vlan_encapsulated_proto = skb->protocol; } /* now, the TCI */ veth->h_vlan_TCI = htons(vlan_tci); return 0; } /** * __vlan_insert_tag - regular VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload * Returns error if skb_cow_head fails. * * Does not change skb->protocol so this function can be used during receive. */ static inline int __vlan_insert_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { return __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN); } /** * vlan_insert_inner_tag - inner VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * @mac_len: MAC header length including outer vlan headers * * Inserts the VLAN tag into @skb as part of the payload at offset mac_len * Returns a VLAN tagged skb. This might change skb->head. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. * * Does not change skb->protocol so this function can be used during receive. */ static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci, unsigned int mac_len) { int err; err = __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, mac_len); if (err) { dev_kfree_skb_any(skb); return NULL; } return skb; } /** * vlan_insert_tag - regular VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload * Returns a VLAN tagged skb. This might change skb->head. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. * * Does not change skb->protocol so this function can be used during receive. */ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { return vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN); } /** * vlan_insert_tag_set_proto - regular VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload * Returns a VLAN tagged skb. This might change skb->head. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. */ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { skb = vlan_insert_tag(skb, vlan_proto, vlan_tci); if (skb) skb->protocol = vlan_proto; return skb; } /** * __vlan_hwaccel_clear_tag - clear hardware accelerated VLAN info * @skb: skbuff to clear * * Clears the VLAN information from @skb */ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) { skb->vlan_all = 0; } /** * __vlan_hwaccel_copy_tag - copy hardware accelerated VLAN info from another skb * @dst: skbuff to copy to * @src: skbuff to copy from * * Copies VLAN information from @src to @dst (for branchless code) */ static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src) { dst->vlan_all = src->vlan_all; } /* * __vlan_hwaccel_push_inside - pushes vlan tag to the payload * @skb: skbuff to tag * * Pushes the VLAN tag from @skb->vlan_tci inside to the payload. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. */ static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb) { skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); if (likely(skb)) __vlan_hwaccel_clear_tag(skb); return skb; } /** * __vlan_hwaccel_put_tag - hardware accelerated VLAN inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Puts the VLAN TCI in @skb->vlan_tci and lets the device do the rest */ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { skb->vlan_proto = vlan_proto; skb->vlan_tci = vlan_tci; } /** * __vlan_get_tag - get the VLAN ID that is part of the payload * @skb: skbuff to query * @vlan_tci: buffer to store value * * Returns error if the skb is not of VLAN type */ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { struct vlan_ethhdr *veth = skb_vlan_eth_hdr(skb); if (!eth_type_vlan(veth->h_vlan_proto)) return -ENODATA; *vlan_tci = ntohs(veth->h_vlan_TCI); return 0; } /** * __vlan_hwaccel_get_tag - get the VLAN ID that is in @skb->cb[] * @skb: skbuff to query * @vlan_tci: buffer to store value * * Returns error if @skb->vlan_tci is not set correctly */ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { if (skb_vlan_tag_present(skb)) { *vlan_tci = skb_vlan_tag_get(skb); return 0; } else { *vlan_tci = 0; return -ENODATA; } } /** * vlan_get_tag - get the VLAN ID from the skb * @skb: skbuff to query * @vlan_tci: buffer to store value * * Returns error if the skb is not VLAN tagged */ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { if (skb->dev->features & NETIF_F_HW_VLAN_CTAG_TX) { return __vlan_hwaccel_get_tag(skb, vlan_tci); } else { return __vlan_get_tag(skb, vlan_tci); } } /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * @type: first vlan protocol * @depth: buffer to store length of eth and vlan tags in bytes * * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, int *depth) { unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; /* if type is 802.1Q/AD then the header should already be * present at mac_len - VLAN_HLEN (if mac_len > 0), or at * ETH_HLEN otherwise */ if (eth_type_vlan(type)) { if (vlan_depth) { if (WARN_ON(vlan_depth < VLAN_HLEN)) return 0; vlan_depth -= VLAN_HLEN; } else { vlan_depth = ETH_HLEN; } do { struct vlan_hdr vhdr, *vh; vh = skb_header_pointer(skb, vlan_depth, sizeof(vhdr), &vhdr); if (unlikely(!vh || !--parse_depth)) return 0; type = vh->h_vlan_encapsulated_proto; vlan_depth += VLAN_HLEN; } while (eth_type_vlan(type)); } if (depth) *depth = vlan_depth; return type; } /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ static inline __be16 vlan_get_protocol(const struct sk_buff *skb) { return __vlan_get_protocol(skb, skb->protocol, NULL); } /* This version of __vlan_get_protocol() also pulls mac header in skb->head */ static inline __be16 vlan_get_protocol_and_depth(struct sk_buff *skb, __be16 type, int *depth) { int maclen; type = __vlan_get_protocol(skb, type, &maclen); if (type) { if (!pskb_may_pull(skb, maclen)) type = 0; else if (depth) *depth = maclen; } return type; } /* A getter for the SKB protocol field which will handle VLAN tags consistently * whether VLAN acceleration is enabled or not. */ static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) { if (!skip_vlan) /* VLAN acceleration strips the VLAN header from the skb and * moves it to skb->vlan_proto */ return skb_vlan_tag_present(skb) ? skb->vlan_proto : skb->protocol; return vlan_get_protocol(skb); } static inline void vlan_set_encap_proto(struct sk_buff *skb, struct vlan_hdr *vhdr) { __be16 proto; unsigned short *rawp; /* * Was a VLAN packet, grab the encapsulated protocol, which the layer * three protocols care about. */ proto = vhdr->h_vlan_encapsulated_proto; if (eth_proto_is_802_3(proto)) { skb->protocol = proto; return; } rawp = (unsigned short *)(vhdr + 1); if (*rawp == 0xFFFF) /* * This is a magic hack to spot IPX packets. Older Novell * breaks the protocol design and runs IPX over 802.3 without * an 802.2 LLC layer. We look for FFFF which isn't a used * 802.2 SSAP/DSAP. This won't work for fault tolerant netware * but does for the rest. */ skb->protocol = htons(ETH_P_802_3); else /* * Real 802.2 LLC */ skb->protocol = htons(ETH_P_802_2); } /** * vlan_remove_tag - remove outer VLAN tag from payload * @skb: skbuff to remove tag from * @vlan_tci: buffer to store value * * Expects the skb to contain a VLAN tag in the payload, and to have skb->data * pointing at the MAC header. * * Returns a new pointer to skb->data, or NULL on failure to pull. */ static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) { struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); *vlan_tci = ntohs(vhdr->h_vlan_TCI); memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); vlan_set_encap_proto(skb, vhdr); return __skb_pull(skb, VLAN_HLEN); } /** * skb_vlan_tagged - check if skb is vlan tagged. * @skb: skbuff to query * * Returns true if the skb is tagged, regardless of whether it is hardware * accelerated or not. */ static inline bool skb_vlan_tagged(const struct sk_buff *skb) { if (!skb_vlan_tag_present(skb) && likely(!eth_type_vlan(skb->protocol))) return false; return true; } /** * skb_vlan_tagged_multi - check if skb is vlan tagged with multiple headers. * @skb: skbuff to query * * Returns true if the skb is tagged with multiple vlan headers, regardless * of whether it is hardware accelerated or not. */ static inline bool skb_vlan_tagged_multi(struct sk_buff *skb) { __be16 protocol = skb->protocol; if (!skb_vlan_tag_present(skb)) { struct vlan_ethhdr *veh; if (likely(!eth_type_vlan(protocol))) return false; if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN))) return false; veh = skb_vlan_eth_hdr(skb); protocol = veh->h_vlan_encapsulated_proto; } if (!eth_type_vlan(protocol)) return false; return true; } /** * vlan_features_check - drop unsafe features for skb with multiple tags. * @skb: skbuff to query * @features: features to be checked * * Returns features without unsafe ones if the skb has multiple tags. */ static inline netdev_features_t vlan_features_check(struct sk_buff *skb, netdev_features_t features) { if (skb_vlan_tagged_multi(skb)) { /* In the case of multi-tagged packets, use a direct mask * instead of using netdev_interesect_features(), to make * sure that only devices supporting NETIF_F_HW_CSUM will * have checksum offloading support. */ features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; } return features; } /** * compare_vlan_header - Compare two vlan headers * @h1: Pointer to vlan header * @h2: Pointer to vlan header * * Compare two vlan headers, returns 0 if equal. * * Please note that alignment of h1 & h2 are only guaranteed to be 16 bits. */ static inline unsigned long compare_vlan_header(const struct vlan_hdr *h1, const struct vlan_hdr *h2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return *(u32 *)h1 ^ *(u32 *)h2; #else return ((__force u32)h1->h_vlan_TCI ^ (__force u32)h2->h_vlan_TCI) | ((__force u32)h1->h_vlan_encapsulated_proto ^ (__force u32)h2->h_vlan_encapsulated_proto); #endif } #endif /* !(_LINUX_IF_VLAN_H_) */
1 1 16066 16066 7577 15340 15363 15341 15398 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMZONE_H #define _LINUX_MMZONE_H #ifndef __ASSEMBLY__ #ifndef __GENERATING_BOUNDS_H #include <linux/spinlock.h> #include <linux/list.h> #include <linux/list_nulls.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/cache.h> #include <linux/threads.h> #include <linux/numa.h> #include <linux/init.h> #include <linux/seqlock.h> #include <linux/nodemask.h> #include <linux/pageblock-flags.h> #include <linux/page-flags-layout.h> #include <linux/atomic.h> #include <linux/mm_types.h> #include <linux/page-flags.h> #include <linux/local_lock.h> #include <linux/zswap.h> #include <asm/page.h> /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_ARCH_FORCE_MAX_ORDER #define MAX_PAGE_ORDER 10 #else #define MAX_PAGE_ORDER CONFIG_ARCH_FORCE_MAX_ORDER #endif #define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER) #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES) #define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1) /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should * coalesce naturally under reasonable reclaim pressure and those which * will not. */ #define PAGE_ALLOC_COSTLY_ORDER 3 enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, #ifdef CONFIG_CMA /* * MIGRATE_CMA migration type is designed to mimic the way * ZONE_MOVABLE works. Only movable pages can be allocated * from MIGRATE_CMA pageblocks and page allocator never * implicitly change migration type of MIGRATE_CMA pageblock. * * The way to use it is to change migratetype of a range of * pageblocks to MIGRATE_CMA which can be done by * __free_pageblock_cma() function. */ MIGRATE_CMA, #endif #ifdef CONFIG_MEMORY_ISOLATION MIGRATE_ISOLATE, /* can't allocate from here */ #endif MIGRATE_TYPES }; /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ extern const char * const migratetype_names[MIGRATE_TYPES]; #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) # define is_migrate_cma_folio(folio, pfn) (MIGRATE_CMA == \ get_pfnblock_flags_mask(&folio->page, pfn, MIGRATETYPE_MASK)) #else # define is_migrate_cma(migratetype) false # define is_migrate_cma_page(_page) false # define is_migrate_cma_folio(folio, pfn) false #endif static inline bool is_migrate_movable(int mt) { return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; } /* * Check whether a migratetype can be merged with another migratetype. * * It is only mergeable when it can fall back to other migratetypes for * allocation. See fallbacks[MIGRATE_TYPES][3] in page_alloc.c. */ static inline bool migratetype_is_mergeable(int mt) { return mt < MIGRATE_PCPTYPES; } #define for_each_migratetype_order(order, type) \ for (order = 0; order < NR_PAGE_ORDERS; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) extern int page_group_by_mobility_disabled; #define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1) #define get_pageblock_migratetype(page) \ get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK) #define folio_migratetype(folio) \ get_pfnblock_flags_mask(&folio->page, folio_pfn(folio), \ MIGRATETYPE_MASK) struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; }; struct pglist_data; #ifdef CONFIG_NUMA enum numa_stat_item { NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ NUMA_FOREIGN, /* was intended here, hit elsewhere */ NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ NR_VM_NUMA_EVENT_ITEMS }; #else #define NR_VM_NUMA_EVENT_ITEMS 0 #endif enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, NR_ZONE_ACTIVE_ANON, NR_ZONE_INACTIVE_FILE, NR_ZONE_ACTIVE_FILE, NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ /* Second 128 byte cacheline */ NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif NR_FREE_CMA_PAGES, #ifdef CONFIG_UNACCEPTED_MEMORY NR_UNACCEPTED, #endif NR_VM_ZONE_STAT_ITEMS }; enum node_stat_item { NR_LRU_BASE, NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ NR_SLAB_RECLAIMABLE_B, NR_SLAB_UNRECLAIMABLE_B, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_NODES, WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_FILE, WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_FILE, WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_FILE, WORKINGSET_NODERECLAIM, NR_ANON_MAPPED, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ NR_FILE_PAGES, NR_FILE_DIRTY, NR_WRITEBACK, NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_SHMEM_THPS, NR_SHMEM_PMDMAPPED, NR_FILE_THPS, NR_FILE_PMDMAPPED, NR_ANON_THPS, NR_VMSCAN_WRITE, NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ NR_THROTTLED_WRITTEN, /* NR_WRITTEN while reclaim throttled */ NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ NR_KERNEL_STACK_KB, /* measured in KiB */ #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) NR_KERNEL_SCS_KB, /* measured in KiB */ #endif NR_PAGETABLE, /* used for pagetables */ NR_SECONDARY_PAGETABLE, /* secondary pagetables, KVM & IOMMU */ #ifdef CONFIG_IOMMU_SUPPORT NR_IOMMU_PAGES, /* # of pages allocated by IOMMU */ #endif #ifdef CONFIG_SWAP NR_SWAPCACHE, #endif #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ PGPROMOTE_CANDIDATE, /* candidate pages to promote */ #endif /* PGDEMOTE_*: pages demoted */ PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif NR_VM_NODE_STAT_ITEMS }; /* * Returns true if the item should be printed in THPs (/proc/vmstat * currently prints number of anon, file and shmem THPs. But the item * is charged in pages). */ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return false; return item == NR_ANON_THPS || item == NR_FILE_THPS || item == NR_SHMEM_THPS || item == NR_SHMEM_PMDMAPPED || item == NR_FILE_PMDMAPPED; } /* * Returns true if the value is measured in bytes (most vmstat values are * measured in pages). This defines the API part, the internal representation * might be different. */ static __always_inline bool vmstat_item_in_bytes(int idx) { /* * Global and per-node slab counters track slab pages. * It's expected that changes are multiples of PAGE_SIZE. * Internally values are stored in pages. * * Per-memcg and per-lruvec counters track memory, consumed * by individual slab objects. These counters are actually * byte-precise. */ return (idx == NR_SLAB_RECLAIMABLE_B || idx == NR_SLAB_UNRECLAIMABLE_B); } /* * We do arithmetic on the LRU lists in various places in the code, * so it is important to keep the active lists LRU_ACTIVE higher in * the array than the corresponding inactive lists, and to keep * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists. * * This has to be kept in sync with the statistics in zone_stat_item * above and the descriptions in vmstat_text in mm/vmstat.c */ #define LRU_BASE 0 #define LRU_ACTIVE 1 #define LRU_FILE 2 enum lru_list { LRU_INACTIVE_ANON = LRU_BASE, LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, LRU_UNEVICTABLE, NR_LRU_LISTS }; enum vmscan_throttle_state { VMSCAN_THROTTLE_WRITEBACK, VMSCAN_THROTTLE_ISOLATED, VMSCAN_THROTTLE_NOPROGRESS, VMSCAN_THROTTLE_CONGESTED, NR_VMSCAN_THROTTLE, }; #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) static inline bool is_file_lru(enum lru_list lru) { return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } static inline bool is_active_lru(enum lru_list lru) { return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } #define WORKINGSET_ANON 0 #define WORKINGSET_FILE 1 #define ANON_AND_FILE 2 enum lruvec_flags { /* * An lruvec has many dirty pages backed by a congested BDI: * 1. LRUVEC_CGROUP_CONGESTED is set by cgroup-level reclaim. * It can be cleared by cgroup reclaim or kswapd. * 2. LRUVEC_NODE_CONGESTED is set by kswapd node-level reclaim. * It can only be cleared by kswapd. * * Essentially, kswapd can unthrottle an lruvec throttled by cgroup * reclaim, but not vice versa. This only applies to the root cgroup. * The goal is to prevent cgroup reclaim on the root cgroup (e.g. * memory.reclaim) to unthrottle an unbalanced node (that was throttled * by kswapd). */ LRUVEC_CGROUP_CONGESTED, LRUVEC_NODE_CONGESTED, }; #endif /* !__GENERATING_BOUNDS_H */ /* * Evictable pages are divided into multiple generations. The youngest and the * oldest generation numbers, max_seq and min_seq, are monotonically increasing. * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while * a page is on one of lrugen->folios[]. Otherwise it stores 0. * * A page is added to the youngest generation on faulting. The aging needs to * check the accessed bit at least twice before handing this page over to the * eviction. The first check takes care of the accessed bit set on the initial * fault; the second check makes sure this page hasn't been used since then. * This process, AKA second chance, requires a minimum of two generations, * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive * LRU, e.g., /proc/vmstat, these two generations are considered active; the * rest of generations, if they exist, are considered inactive. See * lru_gen_is_active(). * * PG_active is always cleared while a page is on one of lrugen->folios[] so * that the aging needs not to worry about it. And it's set again when a page * considered active is isolated for non-reclaiming purposes, e.g., migration. * See lru_gen_add_folio() and lru_gen_del_folio(). * * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits * in folio->flags. */ #define MIN_NR_GENS 2U #define MAX_NR_GENS 4U /* * Each generation is divided into multiple tiers. A page accessed N times * through file descriptors is in tier order_base_2(N). A page in the first tier * (N=0,1) is marked by PG_referenced unless it was faulted in through page * tables or read ahead. A page in any other tier (N>1) is marked by * PG_referenced and PG_workingset. This implies a minimum of two tiers is * supported without using additional bits in folio->flags. * * In contrast to moving across generations which requires the LRU lock, moving * across tiers only involves atomic operations on folio->flags and therefore * has a negligible cost in the buffered access path. In the eviction path, * comparisons of refaulted/(evicted+protected) from the first tier and the * rest infer whether pages accessed multiple times through file descriptors * are statistically hot and thus worth protecting. * * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in * folio->flags. */ #define MAX_NR_TIERS 4U #ifndef __GENERATING_BOUNDS_H struct lruvec; struct page_vma_mapped_walk; #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) #ifdef CONFIG_LRU_GEN enum { LRU_GEN_ANON, LRU_GEN_FILE, }; enum { LRU_GEN_CORE, LRU_GEN_MM_WALK, LRU_GEN_NONLEAF_YOUNG, NR_LRU_GEN_CAPS }; #define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) /* whether to keep historical stats from evicted generations */ #ifdef CONFIG_LRU_GEN_STATS #define NR_HIST_GENS MAX_NR_GENS #else #define NR_HIST_GENS 1U #endif /* * The youngest generation number is stored in max_seq for both anon and file * types as they are aged on an equal footing. The oldest generation numbers are * stored in min_seq[] separately for anon and file types as clean file pages * can be evicted regardless of swap constraints. * * Normally anon and file min_seq are in sync. But if swapping is constrained, * e.g., out of swap space, file min_seq is allowed to advance and leave anon * min_seq behind. * * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. */ struct lru_gen_folio { /* the aging increments the youngest generation number */ unsigned long max_seq; /* the eviction increments the oldest generation numbers */ unsigned long min_seq[ANON_AND_FILE]; /* the birth time of each generation in jiffies */ unsigned long timestamps[MAX_NR_GENS]; /* the multi-gen LRU lists, lazily sorted on eviction */ struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the multi-gen LRU sizes, eventually consistent */ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the exponential moving average of refaulted */ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; /* the exponential moving average of evicted+protected */ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; /* the first tier doesn't need protection, hence the minus one */ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; /* can be modified without holding the LRU lock */ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* whether the multi-gen LRU is enabled */ bool enabled; /* the memcg generation this lru_gen_folio belongs to */ u8 gen; /* the list segment this lru_gen_folio belongs to */ u8 seg; /* per-node lru_gen_folio list for global reclaim */ struct hlist_nulls_node list; }; enum { MM_LEAF_TOTAL, /* total leaf entries */ MM_LEAF_YOUNG, /* young leaf entries */ MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */ MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */ NR_MM_STATS }; /* double-buffering Bloom filters */ #define NR_BLOOM_FILTERS 2 struct lru_gen_mm_state { /* synced with max_seq after each iteration */ unsigned long seq; /* where the current iteration continues after */ struct list_head *head; /* where the last iteration ended before */ struct list_head *tail; /* Bloom filters flip after each iteration */ unsigned long *filters[NR_BLOOM_FILTERS]; /* the mm stats for debugging */ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; }; struct lru_gen_mm_walk { /* the lruvec under reclaim */ struct lruvec *lruvec; /* max_seq from lru_gen_folio: can be out of date */ unsigned long seq; /* the next address within an mm to scan */ unsigned long next_addr; /* to batch promoted pages */ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* to batch the mm stats */ int mm_stats[NR_MM_STATS]; /* total batched items */ int batched; bool can_swap; bool force_scan; }; /* * For each node, memcgs are divided into two generations: the old and the * young. For each generation, memcgs are randomly sharded into multiple bins * to improve scalability. For each bin, the hlist_nulls is virtually divided * into three segments: the head, the tail and the default. * * An onlining memcg is added to the tail of a random bin in the old generation. * The eviction starts at the head of a random bin in the old generation. The * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes * the old generation, is incremented when all its bins become empty. * * There are four operations: * 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its * current generation (old or young) and updates its "seg" to "head"; * 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its * current generation (old or young) and updates its "seg" to "tail"; * 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old * generation, updates its "gen" to "old" and resets its "seg" to "default"; * 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the * young generation, updates its "gen" to "young" and resets its "seg" to * "default". * * The events that trigger the above operations are: * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; * 2. The first attempt to reclaim a memcg below low, which triggers * MEMCG_LRU_TAIL; * 3. The first attempt to reclaim a memcg offlined or below reclaimable size * threshold, which triggers MEMCG_LRU_TAIL; * 4. The second attempt to reclaim a memcg offlined or below reclaimable size * threshold, which triggers MEMCG_LRU_YOUNG; * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG; * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD. * * Notes: * 1. Memcg LRU only applies to global reclaim, and the round-robin incrementing * of their max_seq counters ensures the eventual fairness to all eligible * memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). * 2. There are only two valid generations: old (seq) and young (seq+1). * MEMCG_NR_GENS is set to three so that when reading the generation counter * locklessly, a stale value (seq-1) does not wraparound to young. */ #define MEMCG_NR_GENS 3 #define MEMCG_NR_BINS 8 struct lru_gen_memcg { /* the per-node memcg generation counter */ unsigned long seq; /* each memcg has one lru_gen_folio per node */ unsigned long nr_memcgs[MEMCG_NR_GENS]; /* per-node lru_gen_folio list for global reclaim */ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; /* protects the above */ spinlock_t lock; }; void lru_gen_init_pgdat(struct pglist_data *pgdat); void lru_gen_init_lruvec(struct lruvec *lruvec); bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw); void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); void lru_gen_online_memcg(struct mem_cgroup *memcg); void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid); #else /* !CONFIG_LRU_GEN */ static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) { } static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { return false; } static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { } #endif /* CONFIG_LRU_GEN */ struct lruvec { struct list_head lists[NR_LRU_LISTS]; /* per lruvec lru_lock for memcg */ spinlock_t lru_lock; /* * These track the cost of reclaiming one LRU - file or anon - * over the other. As the observed cost of reclaiming one LRU * increases, the reclaim scan balance tips toward the other. */ unsigned long anon_cost; unsigned long file_cost; /* Non-resident age, driven by LRU movement */ atomic_long_t nonresident_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ struct lru_gen_folio lrugen; #ifdef CONFIG_LRU_GEN_WALKS_MMU /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif #endif /* CONFIG_LRU_GEN */ #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif struct zswap_lruvec_state zswap_lruvec_state; }; /* Isolate for asynchronous migration */ #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) /* Isolate unevictable pages */ #define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8) /* LRU Isolation modes. */ typedef unsigned __bitwise isolate_mode_t; enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, WMARK_PROMO, NR_WMARK }; /* * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. Two additional lists * are added for THP. One PCP list is used by GPF_MOVABLE, and the other PCP list * is used by GFP_UNMOVABLE and GFP_RECLAIMABLE. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define NR_PCP_THP 2 #else #define NR_PCP_THP 0 #endif #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) /* * Flags used in pcp->flags field. * * PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the * previous page freeing. To avoid to drain PCP for an accident * high-order page freeing. * * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in PCP before * draining PCP for consecutive high-order pages freeing without * allocation if data cache slice of CPU is large enough. To reduce * zone lock contention and keep cache-hot pages reusing. */ #define PCPF_PREV_FREE_HIGH_ORDER BIT(0) #define PCPF_FREE_HIGH_BATCH BIT(1) struct per_cpu_pages { spinlock_t lock; /* Protects lists field */ int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int high_min; /* min high watermark */ int high_max; /* max high watermark */ int batch; /* chunk size for buddy add/remove */ u8 flags; /* protected by pcp->lock */ u8 alloc_factor; /* batch scaling factor during allocate */ #ifdef CONFIG_NUMA u8 expire; /* When 0, remote pagesets are drained */ #endif short free_count; /* consecutive free count */ /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[NR_PCP_LISTS]; } ____cacheline_aligned_in_smp; struct per_cpu_zonestat { #ifdef CONFIG_SMP s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; s8 stat_threshold; #endif #ifdef CONFIG_NUMA /* * Low priority inaccurate counters that are only folded * on demand. Use a large type to avoid the overhead of * folding during refresh_cpu_vm_stats. */ unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; #endif }; struct per_cpu_nodestat { s8 stat_threshold; s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS]; }; #endif /* !__GENERATING_BOUNDS.H */ enum zone_type { /* * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able * to DMA to all of the addressable memory (ZONE_NORMAL). * On architectures where this area covers the whole 32 bit address * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller * DMA addressing constraints. This distinction is important as a 32bit * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit * platforms may need both zones as they support peripherals with * different DMA addressing limitations. */ #ifdef CONFIG_ZONE_DMA ZONE_DMA, #endif #ifdef CONFIG_ZONE_DMA32 ZONE_DMA32, #endif /* * Normal addressable memory is in ZONE_NORMAL. DMA operations can be * performed on pages in ZONE_NORMAL if the DMA devices support * transfers to all addressable memory. */ ZONE_NORMAL, #ifdef CONFIG_HIGHMEM /* * A memory area that is only addressable by the kernel through * mapping portions into its own address space. This is for example * used by i386 to allow the kernel to address the memory beyond * 900MB. The kernel will set up special mappings (page * table entries on i386) for each page that the kernel needs to * access. */ ZONE_HIGHMEM, #endif /* * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains * movable pages with few exceptional cases described below. Main use * cases for ZONE_MOVABLE are to make memory offlining/unplug more * likely to succeed, and to locally limit unmovable allocations - e.g., * to increase the number of THP/huge pages. Notable special cases are: * * 1. Pinned pages: (long-term) pinning of movable pages might * essentially turn such pages unmovable. Therefore, we do not allow * pinning long-term pages in ZONE_MOVABLE. When pages are pinned and * faulted, they come from the right zone right away. However, it is * still possible that address space already has pages in * ZONE_MOVABLE at the time when pages are pinned (i.e. user has * touches that memory before pinning). In such case we migrate them * to a different zone. When migration fails - pinning fails. * 2. memblock allocations: kernelcore/movablecore setups might create * situations where ZONE_MOVABLE contains unmovable allocations * after boot. Memory offlining and allocations fail early. * 3. Memory holes: kernelcore/movablecore setups might create very rare * situations where ZONE_MOVABLE contains memory holes after boot, * for example, if we have sections that are only partially * populated. Memory offlining and allocations fail early. * 4. PG_hwpoison pages: while poisoned pages can be skipped during * memory offlining, such pages cannot be allocated. * 5. Unmovable PG_offline pages: in paravirtualized environments, * hotplugged memory blocks might only partially be managed by the * buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The * parts not manged by the buddy are unmovable PG_offline pages. In * some cases (virtio-mem), such pages can be skipped during * memory offlining, however, cannot be moved/allocated. These * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create * situations where ZERO_PAGE(0) which is allocated differently * on different platforms may end up in a movable zone. ZERO_PAGE(0) * cannot be migrated. * 7. Memory-hotplug: when using memmap_on_memory and onlining the * memory to the MOVABLE zone, the vmemmap pages are also placed in * such zone. Such pages cannot be really moved around as they are * self-stored in the range, but they are treated as movable when * the range they describe is about to be offlined. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) * have to expect that migrating pages in ZONE_MOVABLE can fail (even * if has_unmovable_pages() states that there are no unmovable pages, * there can be false negatives). */ ZONE_MOVABLE, #ifdef CONFIG_ZONE_DEVICE ZONE_DEVICE, #endif __MAX_NR_ZONES }; #ifndef __GENERATING_BOUNDS_H #define ASYNC_AND_SYNC 2 struct zone { /* Read-mostly fields */ /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long _watermark[NR_WMARK]; unsigned long watermark_boost; unsigned long nr_reserved_highatomic; unsigned long nr_free_highatomic; /* * We don't know if the memory that we're going to allocate will be * freeable or/and it will be released eventually, so to avoid totally * wasting several GB of ram we must reserve some of the lower zone * memory (otherwise we risk to run OOM on the lower zones despite * there being tons of freeable ram on the higher zones). This array is * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl * changes. */ long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NUMA int node; #endif struct pglist_data *zone_pgdat; struct per_cpu_pages __percpu *per_cpu_pageset; struct per_cpu_zonestat __percpu *per_cpu_zonestats; /* * the high and batch values are copied to individual pagesets for * faster access */ int pageset_high_min; int pageset_high_max; int pageset_batch; #ifndef CONFIG_SPARSEMEM /* * Flags for a pageblock_nr_pages block. See pageblock-flags.h. * In SPARSEMEM, this map is stored in struct mem_section */ unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; /* * spanned_pages is the total pages spanned by the zone, including * holes, which is calculated as: * spanned_pages = zone_end_pfn - zone_start_pfn; * * present_pages is physical pages existing within the zone, which * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * * present_early_pages is present pages existing within the zone * located on memory available since early boot, excluding hotplugged * memory. * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): * managed_pages = present_pages - reserved_pages; * * cma pages is present pages that are assigned for CMA use * (MIGRATE_CMA). * * So present_pages may be used by memory hotplug or memory power * management logic to figure out unmanaged pages by checking * (present_pages - managed_pages). And managed_pages should be used * by page allocator and vm scanner to calculate all kinds of watermarks * and thresholds. * * Locking rules: * * zone_start_pfn and spanned_pages are protected by span_seqlock. * It is a seqlock because it has to be read outside of zone->lock, * and it is done in the main allocator path. But, it is written * quite infrequently. * * The span_seq lock is declared along with zone->lock because it is * frequently read in proximity to zone->lock. It's good to * give them a chance of being in the same cacheline. * * Write access to present_pages at runtime should be protected by * mem_hotplug_begin/done(). Any reader who can't tolerant drift of * present_pages should use get_online_mems() to get a stable value. */ atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; #if defined(CONFIG_MEMORY_HOTPLUG) unsigned long present_early_pages; #endif #ifdef CONFIG_CMA unsigned long cma_pages; #endif const char *name; #ifdef CONFIG_MEMORY_ISOLATION /* * Number of isolated pageblock. It is used to solve incorrect * freepage counting problem due to racy retrieving migratetype * of pageblock. Protected by zone->lock. */ unsigned long nr_isolate_pageblock; #endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; #endif int initialized; /* Write-intensive fields used from the page allocator */ CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ struct free_area free_area[NR_PAGE_ORDERS]; #ifdef CONFIG_UNACCEPTED_MEMORY /* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */ struct list_head unaccepted_pages; #endif /* zone flags, see below */ unsigned long flags; /* Primarily protects free_area */ spinlock_t lock; /* Write-intensive fields used by compaction and vmstats. */ CACHELINE_PADDING(_pad2_); /* * When free pages are below this point, additional steps are taken * when reading the number of free pages to avoid per-cpu counter * drift allowing watermarks to be breached */ unsigned long percpu_drift_mark; #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* pfn where compaction free scanner should start */ unsigned long compact_cached_free_pfn; /* pfn where compaction migration scanner should start */ unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC]; unsigned long compact_init_migrate_pfn; unsigned long compact_init_free_pfn; #endif #ifdef CONFIG_COMPACTION /* * On compaction failure, 1<<compact_defer_shift compactions * are skipped before trying again. The number attempted since * last failure is tracked with compact_considered. * compact_order_failed is the minimum compaction failed order. */ unsigned int compact_considered; unsigned int compact_defer_shift; int compact_order_failed; #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* Set to true when the PG_migrate_skip bits should be cleared */ bool compact_blockskip_flush; #endif bool contiguous; CACHELINE_PADDING(_pad3_); /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; } ____cacheline_internodealigned_in_smp; enum pgdat_flags { PGDAT_DIRTY, /* reclaim scanning has recently found * many dirty file pages at the tail * of the LRU. */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ }; enum zone_flags { ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks. * Cleared when kswapd is woken. */ ZONE_RECLAIM_ACTIVE, /* kswapd may be scanning the zone. */ ZONE_BELOW_HIGH, /* zone is below high watermark. */ }; static inline unsigned long wmark_pages(const struct zone *z, enum zone_watermarks w) { return z->_watermark[w] + z->watermark_boost; } static inline unsigned long min_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_MIN); } static inline unsigned long low_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_LOW); } static inline unsigned long high_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_HIGH); } static inline unsigned long promo_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_PROMO); } static inline unsigned long zone_managed_pages(struct zone *zone) { return (unsigned long)atomic_long_read(&zone->managed_pages); } static inline unsigned long zone_cma_pages(struct zone *zone) { #ifdef CONFIG_CMA return zone->cma_pages; #else return 0; #endif } static inline unsigned long zone_end_pfn(const struct zone *zone) { return zone->zone_start_pfn + zone->spanned_pages; } static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) { return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); } static inline bool zone_is_initialized(struct zone *zone) { return zone->initialized; } static inline bool zone_is_empty(struct zone *zone) { return zone->spanned_pages == 0; } #ifndef BUILD_VDSO32_64 /* * The zone field is never updated after free_area_init_core() * sets it, so none of the operations on it need to be atomic. */ /* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) #define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) #define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) /* * Define the bit shifts to access each section. For non-existent * sections we define the shift as 0; that plus a 0 mask ensures * the compiler will optimise away reference to them. */ #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) #define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) #define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS #define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF) ? \ SECTIONS_PGOFF : ZONES_PGOFF) #else #define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF) ? \ NODES_PGOFF : ZONES_PGOFF) #endif #define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) #define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1) #define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type page_zonenum(const struct page *page) { ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } static inline enum zone_type folio_zonenum(const struct folio *folio) { return page_zonenum(&folio->page); } #ifdef CONFIG_ZONE_DEVICE static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } /* * Consecutive zone device pages should not be merged into the same sgl * or bvec segment with other types of pages or if they belong to different * pgmaps. Otherwise getting the pgmap of a given segment is not possible * without scanning the entire segment. This helper returns true either if * both pages are not zone device pages or both pages are zone device pages * with the same pgmap. */ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { if (is_zone_device_page(a) != is_zone_device_page(b)) return false; if (!is_zone_device_page(a)) return true; return a->pgmap == b->pgmap; } extern void memmap_init_zone_device(struct zone *, unsigned long, unsigned long, struct dev_pagemap *); #else static inline bool is_zone_device_page(const struct page *page) { return false; } static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { return true; } #endif static inline bool folio_is_zone_device(const struct folio *folio) { return is_zone_device_page(&folio->page); } static inline bool is_zone_movable_page(const struct page *page) { return page_zonenum(page) == ZONE_MOVABLE; } static inline bool folio_is_zone_movable(const struct folio *folio) { return folio_zonenum(folio) == ZONE_MOVABLE; } #endif /* * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty * intersection with the given zone */ static inline bool zone_intersects(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { if (zone_is_empty(zone)) return false; if (start_pfn >= zone_end_pfn(zone) || start_pfn + nr_pages <= zone->zone_start_pfn) return false; return true; } /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the * queues ("queue_length >> 12") during an aging round. */ #define DEF_PRIORITY 12 /* Maximum number of zones on a zonelist */ #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES) enum { ZONELIST_FALLBACK, /* zonelist with fallback */ #ifdef CONFIG_NUMA /* * The NUMA zonelists are doubled because we need zonelists that * restrict the allocations to a single node for __GFP_THISNODE. */ ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */ #endif MAX_ZONELISTS }; /* * This struct contains information about a zone in a zonelist. It is stored * here to avoid dereferences into large structures and lookups of tables */ struct zoneref { struct zone *zone; /* Pointer to actual zone */ int zone_idx; /* zone_idx(zoneref->zone) */ }; /* * One allocation request operates on a zonelist. A zonelist * is a list of zones, the first one is the 'goal' of the * allocation, the other zones are fallback zones, in decreasing * priority. * * To speed the reading of the zonelist, the zonerefs contain the zone index * of the entry being read. Helper functions to access information given * a struct zoneref are * * zonelist_zone() - Return the struct zone * for an entry in _zonerefs * zonelist_zone_idx() - Return the index of the zone for an entry * zonelist_node_idx() - Return the index of the node for an entry */ struct zonelist { struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; }; /* * The array of struct pages for flatmem. * It must be declared for SPARSEMEM as well because there are configurations * that rely on that. */ extern struct page *mem_map; #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split { spinlock_t split_queue_lock; struct list_head split_queue; unsigned long split_queue_len; }; #endif #ifdef CONFIG_MEMORY_FAILURE /* * Per NUMA node memory failure handling statistics. */ struct memory_failure_stats { /* * Number of raw pages poisoned. * Cases not accounted: memory outside kernel control, offline page, * arch-specific memory_failure (SGX), hwpoison_filter() filtered * error events, and unpoison actions from hwpoison_unpoison. */ unsigned long total; /* * Recovery results of poisoned raw pages handled by memory_failure, * in sync with mf_result. * total = ignored + failed + delayed + recovered. * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted. */ unsigned long ignored; unsigned long failed; unsigned long delayed; unsigned long recovered; }; #endif /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which * describes the whole memory. * * Memory statistics and page replacement data structures are maintained on a * per-zone basis. */ typedef struct pglist_data { /* * node_zones contains just the zones for THIS node. Not all of the * zones may be populated, but it is the full list. It is referenced by * this node's node_zonelists as well as other node's node_zonelists. */ struct zone node_zones[MAX_NR_ZONES]; /* * node_zonelists contains references to all zones in all nodes. * Generally the first zones will be references to this node's * node_zones. */ struct zonelist node_zonelists[MAX_ZONELISTS]; int nr_zones; /* number of populated zones in this node */ #ifdef CONFIG_FLATMEM /* means !SPARSEMEM */ struct page *node_mem_map; #ifdef CONFIG_PAGE_EXTENSION struct page_ext *node_page_ext; #endif #endif #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * Must be held any time you expect node_start_pfn, * node_present_pages, node_spanned_pages or nr_zones to stay constant. * Also synchronizes pgdat->first_deferred_pfn during deferred page * init. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG * or CONFIG_DEFERRED_STRUCT_PAGE_INIT. * * Nests above zone->lock and zone->span_seqlock */ spinlock_t node_size_lock; #endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; /* workqueues for throttling reclaim for different reasons. */ wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE]; atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */ unsigned long nr_reclaim_start; /* nr pages written while throttled * when throttling started. */ #ifdef CONFIG_MEMORY_HOTPLUG struct mutex kswapd_lock; #endif struct task_struct *kswapd; /* Protected by kswapd_lock */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; int kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; bool proactive_compact_trigger; #endif /* * This is a per-node reserve of pages that are not available * to userspace allocations. */ unsigned long totalreserve_pages; #ifdef CONFIG_NUMA /* * node reclaim becomes active if more unmapped pages exist. */ unsigned long min_unmapped_pages; unsigned long min_slab_pages; #endif /* CONFIG_NUMA */ /* Write-intensive fields used by page reclaim */ CACHELINE_PADDING(_pad1_); #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * If memory initialisation on large machines is deferred then this * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif #ifdef CONFIG_NUMA_BALANCING /* start time in ms of current promote rate limit period */ unsigned int nbp_rl_start; /* number of promote candidate pages at start time of current rate limit period */ unsigned long nbp_rl_nr_cand; /* promote threshold in ms */ unsigned int nbp_threshold; /* start time in ms of current promote threshold adjustment period */ unsigned int nbp_th_start; /* * number of promote candidate pages at start time of current promote * threshold adjustment period */ unsigned long nbp_th_nr_cand; #endif /* Fields commonly accessed by the page reclaim scanner */ /* * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED. * * Use mem_cgroup_lruvec() to look up lruvecs. */ struct lruvec __lruvec; unsigned long flags; #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ struct lru_gen_mm_walk mm_walk; /* lru_gen_folio list */ struct lru_gen_memcg memcg_lru; #endif CACHELINE_PADDING(_pad2_); /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; #ifdef CONFIG_NUMA struct memory_tier __rcu *memtier; #endif #ifdef CONFIG_MEMORY_FAILURE struct memory_failure_stats mf_stats; #endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; } #include <linux/memory_hotplug.h> void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, enum zone_type highest_zoneidx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx); /* * Memory initialization context, use to differentiate memory added by * the platform statically or via memory hotplug interface. */ enum meminit_context { MEMINIT_EARLY, MEMINIT_HOTPLUG, }; extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size); extern void lruvec_init(struct lruvec *lruvec); static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) { #ifdef CONFIG_MEMCG return lruvec->pgdat; #else return container_of(lruvec, struct pglist_data, __lruvec); #endif } #ifdef CONFIG_HAVE_MEMORYLESS_NODES int local_memory_node(int node_id); #else static inline int local_memory_node(int node_id) { return node_id; }; #endif /* * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) #ifdef CONFIG_ZONE_DEVICE static inline bool zone_is_zone_device(struct zone *zone) { return zone_idx(zone) == ZONE_DEVICE; } #else static inline bool zone_is_zone_device(struct zone *zone) { return false; } #endif /* * Returns true if a zone has pages managed by the buddy allocator. * All the reclaim decisions have to use this function rather than * populated_zone(). If the whole zone is reserved then we can easily * end up with populated_zone() && !managed_zone(). */ static inline bool managed_zone(struct zone *zone) { return zone_managed_pages(zone); } /* Returns true if a zone has memory */ static inline bool populated_zone(struct zone *zone) { return zone->present_pages; } #ifdef CONFIG_NUMA static inline int zone_to_nid(struct zone *zone) { return zone->node; } static inline void zone_set_nid(struct zone *zone, int nid) { zone->node = nid; } #else static inline int zone_to_nid(struct zone *zone) { return 0; } static inline void zone_set_nid(struct zone *zone, int nid) {} #endif extern int movable_zone; static inline int is_highmem_idx(enum zone_type idx) { #ifdef CONFIG_HIGHMEM return (idx == ZONE_HIGHMEM || (idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM)); #else return 0; #endif } /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. * @zone: pointer to struct zone variable * Return: 1 for a highmem zone, 0 otherwise */ static inline int is_highmem(struct zone *zone) { return is_highmem_idx(zone_idx(zone)); } #ifdef CONFIG_ZONE_DMA bool has_managed_dma(void); #else static inline bool has_managed_dma(void) { return false; } #endif #ifndef CONFIG_NUMA extern struct pglist_data contig_page_data; static inline struct pglist_data *NODE_DATA(int nid) { return &contig_page_data; } #else /* CONFIG_NUMA */ #include <asm/mmzone.h> #endif /* !CONFIG_NUMA */ extern struct pglist_data *first_online_pgdat(void); extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); extern struct zone *next_zone(struct zone *zone); /** * for_each_online_pgdat - helper macro to iterate over all online nodes * @pgdat: pointer to a pg_data_t variable */ #define for_each_online_pgdat(pgdat) \ for (pgdat = first_online_pgdat(); \ pgdat; \ pgdat = next_online_pgdat(pgdat)) /** * for_each_zone - helper macro to iterate over all memory zones * @zone: pointer to struct zone variable * * The user only needs to declare the zone variable, for_each_zone * fills it in. */ #define for_each_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) #define for_each_populated_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) \ if (!populated_zone(zone)) \ ; /* do nothing */ \ else static inline struct zone *zonelist_zone(struct zoneref *zoneref) { return zoneref->zone; } static inline int zonelist_zone_idx(struct zoneref *zoneref) { return zoneref->zone_idx; } static inline int zonelist_node_idx(struct zoneref *zoneref) { return zone_to_nid(zoneref->zone); } struct zoneref *__next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes); /** * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point * @z: The cursor used as a starting point for the search * @highest_zoneidx: The zone index of the highest zone to return * @nodes: An optional nodemask to filter the zonelist with * * This function returns the next zone at or below a given zone index that is * within the allowed nodemask using a cursor as the starting point for the * search. The zoneref returned is a cursor that represents the current zone * being examined. It should be advanced by one before calling * next_zones_zonelist again. * * Return: the next zone at or below highest_zoneidx within the allowed * nodemask using a cursor within a zonelist as a starting point */ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes) { if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx)) return z; return __next_zones_zonelist(z, highest_zoneidx, nodes); } /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist * @zonelist: The zonelist to search for a suitable zone * @highest_zoneidx: The zone index of the highest zone to return * @nodes: An optional nodemask to filter the zonelist with * * This function returns the first zone at or below a given zone index that is * within the allowed nodemask. The zoneref returned is a cursor that can be * used to iterate the zonelist with next_zones_zonelist by advancing it by * one before calling. * * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is * never NULL). This may happen either genuinely, or due to concurrent nodemask * update due to cpuset modification. * * Return: Zoneref pointer for the first suitable zone found */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, nodemask_t *nodes) { return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes); } /** * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask * @zone: The current zone in the iterator * @z: The current pointer within zonelist->_zonerefs being iterated * @zlist: The zonelist being iterated * @highidx: The zone index of the highest zone to return * @nodemask: Nodemask allowed by the allocator * * This iterator iterates though all zones at or below a given zone index and * within a given nodemask */ #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) #define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \ for (zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index * @zone: The current zone in the iterator * @z: The current pointer within zonelist->zones being iterated * @zlist: The zonelist being iterated * @highidx: The zone index of the highest zone to return * * This iterator iterates though all zones at or below a given zone index. */ #define for_each_zone_zonelist(zone, z, zlist, highidx) \ for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) /* Whether the 'nodes' are all movable nodes */ static inline bool movable_only_nodes(nodemask_t *nodes) { struct zonelist *zonelist; struct zoneref *z; int nid; if (nodes_empty(*nodes)) return false; /* * We can chose arbitrary node from the nodemask to get a * zonelist as they are interlinked. We just need to find * at least one zone that can satisfy kernel allocations. */ nid = first_node(*nodes); zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes); return (!zonelist_zone(z)) ? true : false; } #ifdef CONFIG_SPARSEMEM #include <asm/sparsemem.h> #endif #ifdef CONFIG_FLATMEM #define pfn_to_nid(pfn) (0) #endif #ifdef CONFIG_SPARSEMEM /* * PA_SECTION_SHIFT physical address to/from section number * PFN_SECTION_SHIFT pfn to/from section number */ #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) #define SECTION_BLOCKFLAGS_BITS \ ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) #if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS #error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE #endif static inline unsigned long pfn_to_section_nr(unsigned long pfn) { return pfn >> PFN_SECTION_SHIFT; } static inline unsigned long section_nr_to_pfn(unsigned long sec) { return sec << PFN_SECTION_SHIFT; } #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) #define SUBSECTION_SHIFT 21 #define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT) #define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT) #define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT) #define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1)) #if SUBSECTION_SHIFT > SECTION_SIZE_BITS #error Subsection size exceeds section size #else #define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT)) #endif #define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION) #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK) struct mem_section_usage { struct rcu_head rcu; #ifdef CONFIG_SPARSEMEM_VMEMMAP DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION); #endif /* See declaration of similar field in struct zone */ unsigned long pageblock_flags[0]; }; void subsection_map_init(unsigned long pfn, unsigned long nr_pages); struct page; struct page_ext; struct mem_section { /* * This is, logically, a pointer to an array of struct * pages. However, it is stored with some other magic. * (see sparse.c::sparse_init_one_section()) * * Additionally during early boot we encode node id of * the location of the section here to guide allocation. * (see sparse.c::memory_present()) * * Making it a UL at least makes someone do a cast * before using it wrong. */ unsigned long section_mem_map; struct mem_section_usage *usage; #ifdef CONFIG_PAGE_EXTENSION /* * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use * section. (see page_ext.h about this.) */ struct page_ext *page_ext; unsigned long pad; #endif /* * WARNING: mem_section must be a power-of-2 in size for the * calculation and use of SECTION_ROOT_MASK to make sense. */ }; #ifdef CONFIG_SPARSEMEM_EXTREME #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) #else #define SECTIONS_PER_ROOT 1 #endif #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT) #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) #ifdef CONFIG_SPARSEMEM_EXTREME extern struct mem_section **mem_section; #else extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; #endif static inline unsigned long *section_to_usemap(struct mem_section *ms) { return ms->usage->pageblock_flags; } static inline struct mem_section *__nr_to_section(unsigned long nr) { unsigned long root = SECTION_NR_TO_ROOT(nr); if (unlikely(root >= NR_SECTION_ROOTS)) return NULL; #ifdef CONFIG_SPARSEMEM_EXTREME if (!mem_section || !mem_section[root]) return NULL; #endif return &mem_section[root][nr & SECTION_ROOT_MASK]; } extern size_t mem_section_usage_size(void); /* * We use the lower bits of the mem_map pointer to store * a little bit of information. The pointer is calculated * as mem_map - section_nr_to_pfn(pnum). The result is * aligned to the minimum alignment of the two values: * 1. All mem_map arrays are page-aligned. * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT * lowest bits. PFN_SECTION_SHIFT is arch-specific * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the * worst combination is powerpc with 256k pages, * which results in PFN_SECTION_SHIFT equal 6. * To sum it up, at least 6 bits are available on all architectures. * However, we can exceed 6 bits on some other architectures except * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available * with the worst case of 64K pages on arm64) if we make sure the * exceeded bit is not applicable to powerpc. */ enum { SECTION_MARKED_PRESENT_BIT, SECTION_HAS_MEM_MAP_BIT, SECTION_IS_ONLINE_BIT, SECTION_IS_EARLY_BIT, #ifdef CONFIG_ZONE_DEVICE SECTION_TAINT_ZONE_DEVICE_BIT, #endif SECTION_MAP_LAST_BIT, }; #define SECTION_MARKED_PRESENT BIT(SECTION_MARKED_PRESENT_BIT) #define SECTION_HAS_MEM_MAP BIT(SECTION_HAS_MEM_MAP_BIT) #define SECTION_IS_ONLINE BIT(SECTION_IS_ONLINE_BIT) #define SECTION_IS_EARLY BIT(SECTION_IS_EARLY_BIT) #ifdef CONFIG_ZONE_DEVICE #define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT) #endif #define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1)) #define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT static inline struct page *__section_mem_map_addr(struct mem_section *section) { unsigned long map = section->section_mem_map; map &= SECTION_MAP_MASK; return (struct page *)map; } static inline int present_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); } static inline int present_section_nr(unsigned long nr) { return present_section(__nr_to_section(nr)); } static inline int valid_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); } static inline int early_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_EARLY)); } static inline int valid_section_nr(unsigned long nr) { return valid_section(__nr_to_section(nr)); } static inline int online_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } #ifdef CONFIG_ZONE_DEVICE static inline int online_device_section(struct mem_section *section) { unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; return section && ((section->section_mem_map & flags) == flags); } #else static inline int online_device_section(struct mem_section *section) { return 0; } #endif static inline int online_section_nr(unsigned long nr) { return online_section(__nr_to_section(nr)); } #ifdef CONFIG_MEMORY_HOTPLUG void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); #endif static inline struct mem_section *__pfn_to_section(unsigned long pfn) { return __nr_to_section(pfn_to_section_nr(pfn)); } extern unsigned long __highest_present_section_nr; static inline int subsection_map_index(unsigned long pfn) { return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION; } #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { int idx = subsection_map_index(pfn); struct mem_section_usage *usage = READ_ONCE(ms->usage); return usage ? test_bit(idx, usage->subsection_map) : 0; } #else static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { return 1; } #endif #ifndef CONFIG_HAVE_ARCH_PFN_VALID /** * pfn_valid - check if there is a valid memory map entry for a PFN * @pfn: the page frame number to check * * Check if there is a valid memory map entry aka struct page for the @pfn. * Note, that availability of the memory map entry does not imply that * there is actual usable memory at that @pfn. The struct page may * represent a hole or an unusable page frame. * * Return: 1 for PFNs that have memory map entries and 0 otherwise */ static inline int pfn_valid(unsigned long pfn) { struct mem_section *ms; int ret; /* * Ensure the upper PAGE_SHIFT bits are clear in the * pfn. Else it might lead to false positives when * some of the upper bits are set, but the lower bits * match a valid pfn. */ if (PHYS_PFN(PFN_PHYS(pfn)) != pfn) return 0; if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; ms = __pfn_to_section(pfn); rcu_read_lock_sched(); if (!valid_section(ms)) { rcu_read_unlock_sched(); return 0; } /* * Traditionally early sections always returned pfn_valid() for * the entire section-sized span. */ ret = early_section(ms) || pfn_section_valid(ms, pfn); rcu_read_unlock_sched(); return ret; } #endif static inline int pfn_in_present_section(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; return present_section(__pfn_to_section(pfn)); } static inline unsigned long next_present_section_nr(unsigned long section_nr) { while (++section_nr <= __highest_present_section_nr) { if (present_section_nr(section_nr)) return section_nr; } return -1; } /* * These are _only_ used during initialisation, therefore they * can use __initdata ... They could have names to indicate * this restriction. */ #ifdef CONFIG_NUMA #define pfn_to_nid(pfn) \ ({ \ unsigned long __pfn_to_nid_pfn = (pfn); \ page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \ }) #else #define pfn_to_nid(pfn) (0) #endif void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) #define pfn_in_present_section pfn_valid #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */
31 31 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 // SPDX-License-Identifier: GPL-2.0-or-later /* * cgroups support for the BFQ I/O scheduler. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/cgroup.h> #include <linux/ktime.h> #include <linux/rbtree.h> #include <linux/ioprio.h> #include <linux/sbitmap.h> #include <linux/delay.h> #include "elevator.h" #include "bfq-iosched.h" #ifdef CONFIG_BFQ_CGROUP_DEBUG static int bfq_stat_init(struct bfq_stat *stat, gfp_t gfp) { int ret; ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp); if (ret) return ret; atomic64_set(&stat->aux_cnt, 0); return 0; } static void bfq_stat_exit(struct bfq_stat *stat) { percpu_counter_destroy(&stat->cpu_cnt); } /** * bfq_stat_add - add a value to a bfq_stat * @stat: target bfq_stat * @val: value to add * * Add @val to @stat. The caller must ensure that IRQ on the same CPU * don't re-enter this function for the same counter. */ static inline void bfq_stat_add(struct bfq_stat *stat, uint64_t val) { percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH); } /** * bfq_stat_read - read the current value of a bfq_stat * @stat: bfq_stat to read */ static inline uint64_t bfq_stat_read(struct bfq_stat *stat) { return percpu_counter_sum_positive(&stat->cpu_cnt); } /** * bfq_stat_reset - reset a bfq_stat * @stat: bfq_stat to reset */ static inline void bfq_stat_reset(struct bfq_stat *stat) { percpu_counter_set(&stat->cpu_cnt, 0); atomic64_set(&stat->aux_cnt, 0); } /** * bfq_stat_add_aux - add a bfq_stat into another's aux count * @to: the destination bfq_stat * @from: the source * * Add @from's count including the aux one to @to's aux count. */ static inline void bfq_stat_add_aux(struct bfq_stat *to, struct bfq_stat *from) { atomic64_add(bfq_stat_read(from) + atomic64_read(&from->aux_cnt), &to->aux_cnt); } /** * blkg_prfill_stat - prfill callback for bfq_stat * @sf: seq_file to print to * @pd: policy private data of interest * @off: offset to the bfq_stat in @pd * * prfill callback for printing a bfq_stat. */ static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off) { return __blkg_prfill_u64(sf, pd, bfq_stat_read((void *)pd + off)); } /* bfqg stats flags */ enum bfqg_stats_flags { BFQG_stats_waiting = 0, BFQG_stats_idling, BFQG_stats_empty, }; #define BFQG_FLAG_FNS(name) \ static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ { \ stats->flags |= (1 << BFQG_stats_##name); \ } \ static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ { \ stats->flags &= ~(1 << BFQG_stats_##name); \ } \ static int bfqg_stats_##name(struct bfqg_stats *stats) \ { \ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ } \ BFQG_FLAG_FNS(waiting) BFQG_FLAG_FNS(idling) BFQG_FLAG_FNS(empty) #undef BFQG_FLAG_FNS /* This should be called with the scheduler lock held. */ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) { u64 now; if (!bfqg_stats_waiting(stats)) return; now = blk_time_get_ns(); if (now > stats->start_group_wait_time) bfq_stat_add(&stats->group_wait_time, now - stats->start_group_wait_time); bfqg_stats_clear_waiting(stats); } /* This should be called with the scheduler lock held. */ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, struct bfq_group *curr_bfqg) { struct bfqg_stats *stats = &bfqg->stats; if (bfqg_stats_waiting(stats)) return; if (bfqg == curr_bfqg) return; stats->start_group_wait_time = blk_time_get_ns(); bfqg_stats_mark_waiting(stats); } /* This should be called with the scheduler lock held. */ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { u64 now; if (!bfqg_stats_empty(stats)) return; now = blk_time_get_ns(); if (now > stats->start_empty_time) bfq_stat_add(&stats->empty_time, now - stats->start_empty_time); bfqg_stats_clear_empty(stats); } void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { bfq_stat_add(&bfqg->stats.dequeue, 1); } void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; if (blkg_rwstat_total(&stats->queued)) return; /* * group is already marked empty. This can happen if bfqq got new * request in parent group and moved to this group while being added * to service tree. Just ignore the event and move on. */ if (bfqg_stats_empty(stats)) return; stats->start_empty_time = blk_time_get_ns(); bfqg_stats_mark_empty(stats); } void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; if (bfqg_stats_idling(stats)) { u64 now = blk_time_get_ns(); if (now > stats->start_idle_time) bfq_stat_add(&stats->idle_time, now - stats->start_idle_time); bfqg_stats_clear_idling(stats); } } void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; stats->start_idle_time = blk_time_get_ns(); bfqg_stats_mark_idling(stats); } void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; bfq_stat_add(&stats->avg_queue_size_sum, blkg_rwstat_total(&stats->queued)); bfq_stat_add(&stats->avg_queue_size_samples, 1); bfqg_stats_update_group_wait_time(stats); } void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, blk_opf_t opf) { blkg_rwstat_add(&bfqg->stats.queued, opf, 1); bfqg_stats_end_empty_time(&bfqg->stats); if (!(bfqq == bfqg->bfqd->in_service_queue)) bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); } void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { blkg_rwstat_add(&bfqg->stats.queued, opf, -1); } void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { blkg_rwstat_add(&bfqg->stats.merged, opf, 1); } void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf) { struct bfqg_stats *stats = &bfqg->stats; u64 now = blk_time_get_ns(); if (now > io_start_time_ns) blkg_rwstat_add(&stats->service_time, opf, now - io_start_time_ns); if (io_start_time_ns > start_time_ns) blkg_rwstat_add(&stats->wait_time, opf, io_start_time_ns - start_time_ns); } #else /* CONFIG_BFQ_CGROUP_DEBUG */ void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf) { } void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } #endif /* CONFIG_BFQ_CGROUP_DEBUG */ #ifdef CONFIG_BFQ_GROUP_IOSCHED /* * blk-cgroup policy-related handlers * The following functions help in converting between blk-cgroup * internal structures and BFQ-specific structures. */ static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) { return pd ? container_of(pd, struct bfq_group, pd) : NULL; } struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) { return pd_to_blkg(&bfqg->pd); } static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) { return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq)); } /* * bfq_group handlers * The following functions help in navigating the bfq_group hierarchy * by allowing to find the parent of a bfq_group or the bfq_group * associated to a bfq_queue. */ static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) { struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; return pblkg ? blkg_to_bfqg(pblkg) : NULL; } struct bfq_group *bfqq_group(struct bfq_queue *bfqq) { struct bfq_entity *group_entity = bfqq->entity.parent; return group_entity ? container_of(group_entity, struct bfq_group, entity) : bfqq->bfqd->root_group; } /* * The following two functions handle get and put of a bfq_group by * wrapping the related blk-cgroup hooks. */ static void bfqg_get(struct bfq_group *bfqg) { refcount_inc(&bfqg->ref); } static void bfqg_put(struct bfq_group *bfqg) { if (refcount_dec_and_test(&bfqg->ref)) kfree(bfqg); } static void bfqg_and_blkg_get(struct bfq_group *bfqg) { /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ bfqg_get(bfqg); blkg_get(bfqg_to_blkg(bfqg)); } void bfqg_and_blkg_put(struct bfq_group *bfqg) { blkg_put(bfqg_to_blkg(bfqg)); bfqg_put(bfqg); } void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq) { struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg); if (!bfqg) return; blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq)); blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1); } /* @stats = 0 */ static void bfqg_stats_reset(struct bfqg_stats *stats) { #ifdef CONFIG_BFQ_CGROUP_DEBUG /* queued stats shouldn't be cleared */ blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); bfq_stat_reset(&stats->time); bfq_stat_reset(&stats->avg_queue_size_sum); bfq_stat_reset(&stats->avg_queue_size_samples); bfq_stat_reset(&stats->dequeue); bfq_stat_reset(&stats->group_wait_time); bfq_stat_reset(&stats->idle_time); bfq_stat_reset(&stats->empty_time); #endif } /* @to += @from */ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) { if (!to || !from) return; #ifdef CONFIG_BFQ_CGROUP_DEBUG /* queued stats shouldn't be cleared */ blkg_rwstat_add_aux(&to->merged, &from->merged); blkg_rwstat_add_aux(&to->service_time, &from->service_time); blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); bfq_stat_add_aux(&from->time, &from->time); bfq_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); bfq_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); bfq_stat_add_aux(&to->dequeue, &from->dequeue); bfq_stat_add_aux(&to->group_wait_time, &from->group_wait_time); bfq_stat_add_aux(&to->idle_time, &from->idle_time); bfq_stat_add_aux(&to->empty_time, &from->empty_time); #endif } /* * Transfer @bfqg's stats to its parent's aux counts so that the ancestors' * recursive stats can still account for the amount used by this bfqg after * it's gone. */ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) { struct bfq_group *parent; if (!bfqg) /* root_group */ return; parent = bfqg_parent(bfqg); lockdep_assert_held(&bfqg_to_blkg(bfqg)->q->queue_lock); if (unlikely(!parent)) return; bfqg_stats_add_aux(&parent->stats, &bfqg->stats); bfqg_stats_reset(&bfqg->stats); } void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->weight = entity->new_weight; entity->orig_weight = entity->new_weight; if (bfqq) { bfqq->ioprio = bfqq->new_ioprio; bfqq->ioprio_class = bfqq->new_ioprio_class; /* * Make sure that bfqg and its associated blkg do not * disappear before entity. */ bfqg_and_blkg_get(bfqg); } entity->parent = bfqg->my_entity; /* NULL for root group */ entity->sched_data = &bfqg->sched_data; } static void bfqg_stats_exit(struct bfqg_stats *stats) { blkg_rwstat_exit(&stats->bytes); blkg_rwstat_exit(&stats->ios); #ifdef CONFIG_BFQ_CGROUP_DEBUG blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); blkg_rwstat_exit(&stats->queued); bfq_stat_exit(&stats->time); bfq_stat_exit(&stats->avg_queue_size_sum); bfq_stat_exit(&stats->avg_queue_size_samples); bfq_stat_exit(&stats->dequeue); bfq_stat_exit(&stats->group_wait_time); bfq_stat_exit(&stats->idle_time); bfq_stat_exit(&stats->empty_time); #endif } static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { if (blkg_rwstat_init(&stats->bytes, gfp) || blkg_rwstat_init(&stats->ios, gfp)) goto error; #ifdef CONFIG_BFQ_CGROUP_DEBUG if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || blkg_rwstat_init(&stats->queued, gfp) || bfq_stat_init(&stats->time, gfp) || bfq_stat_init(&stats->avg_queue_size_sum, gfp) || bfq_stat_init(&stats->avg_queue_size_samples, gfp) || bfq_stat_init(&stats->dequeue, gfp) || bfq_stat_init(&stats->group_wait_time, gfp) || bfq_stat_init(&stats->idle_time, gfp) || bfq_stat_init(&stats->empty_time, gfp)) goto error; #endif return 0; error: bfqg_stats_exit(stats); return -ENOMEM; } static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) { return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; } static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) { return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); } static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) { struct bfq_group_data *bgd; bgd = kzalloc(sizeof(*bgd), gfp); if (!bgd) return NULL; bgd->weight = CGROUP_WEIGHT_DFL; return &bgd->pd; } static void bfq_cpd_free(struct blkcg_policy_data *cpd) { kfree(cpd_to_bfqgd(cpd)); } static struct blkg_policy_data *bfq_pd_alloc(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { struct bfq_group *bfqg; bfqg = kzalloc_node(sizeof(*bfqg), gfp, disk->node_id); if (!bfqg) return NULL; if (bfqg_stats_init(&bfqg->stats, gfp)) { kfree(bfqg); return NULL; } /* see comments in bfq_bic_update_cgroup for why refcounting */ refcount_set(&bfqg->ref, 1); return &bfqg->pd; } static void bfq_pd_init(struct blkg_policy_data *pd) { struct blkcg_gq *blkg = pd_to_blkg(pd); struct bfq_group *bfqg = blkg_to_bfqg(blkg); struct bfq_data *bfqd = blkg->q->elevator->elevator_data; struct bfq_entity *entity = &bfqg->entity; struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); entity->orig_weight = entity->weight = entity->new_weight = d->weight; entity->my_sched_data = &bfqg->sched_data; entity->last_bfqq_created = NULL; bfqg->my_entity = entity; /* * the root_group's will be set to NULL * in bfq_init_queue() */ bfqg->bfqd = bfqd; bfqg->active_entities = 0; bfqg->num_queues_with_pending_reqs = 0; bfqg->rq_pos_tree = RB_ROOT; } static void bfq_pd_free(struct blkg_policy_data *pd) { struct bfq_group *bfqg = pd_to_bfqg(pd); bfqg_stats_exit(&bfqg->stats); bfqg_put(bfqg); } static void bfq_pd_reset_stats(struct blkg_policy_data *pd) { struct bfq_group *bfqg = pd_to_bfqg(pd); bfqg_stats_reset(&bfqg->stats); } static void bfq_group_set_parent(struct bfq_group *bfqg, struct bfq_group *parent) { struct bfq_entity *entity; entity = &bfqg->entity; entity->parent = parent->my_entity; entity->sched_data = &parent->sched_data; } static void bfq_link_bfqg(struct bfq_data *bfqd, struct bfq_group *bfqg) { struct bfq_group *parent; struct bfq_entity *entity; /* * Update chain of bfq_groups as we might be handling a leaf group * which, along with some of its relatives, has not been hooked yet * to the private hierarchy of BFQ. */ entity = &bfqg->entity; for_each_entity(entity) { struct bfq_group *curr_bfqg = container_of(entity, struct bfq_group, entity); if (curr_bfqg != bfqd->root_group) { parent = bfqg_parent(curr_bfqg); if (!parent) parent = bfqd->root_group; bfq_group_set_parent(curr_bfqg, parent); } } } struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; struct bfq_group *bfqg; while (blkg) { if (!blkg->online) { blkg = blkg->parent; continue; } bfqg = blkg_to_bfqg(blkg); if (bfqg->pd.online) { bio_associate_blkg_from_css(bio, &blkg->blkcg->css); return bfqg; } blkg = blkg->parent; } bio_associate_blkg_from_css(bio, &bfqg_to_blkg(bfqd->root_group)->blkcg->css); return bfqd->root_group; } /** * bfq_bfqq_move - migrate @bfqq to @bfqg. * @bfqd: queue descriptor. * @bfqq: the queue to move. * @bfqg: the group to move to. * * Move @bfqq to @bfqg, deactivating it from its old group and reactivating * it on the new one. Avoid putting the entity on the old group idle tree. * * Must be called under the scheduler lock, to make sure that the blkg * owning @bfqg does not disappear (see comments in * bfq_bic_update_cgroup on guaranteeing the consistency of blkg * objects). */ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg) { struct bfq_entity *entity = &bfqq->entity; struct bfq_group *old_parent = bfqq_group(bfqq); bool has_pending_reqs = false; /* * No point to move bfqq to the same group, which can happen when * root group is offlined */ if (old_parent == bfqg) return; /* * oom_bfqq is not allowed to move, oom_bfqq will hold ref to root_group * until elevator exit. */ if (bfqq == &bfqd->oom_bfqq) return; /* * Get extra reference to prevent bfqq from being freed in * next possible expire or deactivate. */ bfqq->ref++; if (entity->in_groups_with_pending_reqs) { has_pending_reqs = true; bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); } /* If bfqq is empty, then bfq_bfqq_expire also invokes * bfq_del_bfqq_busy, thereby removing bfqq and its entity * from data structures related to current group. Otherwise we * need to remove bfqq explicitly with bfq_deactivate_bfqq, as * we do below. */ if (bfqq == bfqd->in_service_queue) bfq_bfqq_expire(bfqd, bfqd->in_service_queue, false, BFQQE_PREEMPTED); if (bfq_bfqq_busy(bfqq)) bfq_deactivate_bfqq(bfqd, bfqq, false, false); else if (entity->on_st_or_in_serv) bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); bfqg_and_blkg_put(old_parent); bfq_reassign_last_bfqq(bfqq, NULL); entity->parent = bfqg->my_entity; entity->sched_data = &bfqg->sched_data; /* pin down bfqg and its associated blkg */ bfqg_and_blkg_get(bfqg); if (has_pending_reqs) bfq_add_bfqq_in_groups_with_pending_reqs(bfqq); if (bfq_bfqq_busy(bfqq)) { if (unlikely(!bfqd->nonrot_with_queueing)) bfq_pos_tree_add_move(bfqd, bfqq); bfq_activate_bfqq(bfqd, bfqq); } if (!bfqd->in_service_queue && !bfqd->tot_rq_in_driver) bfq_schedule_dispatch(bfqd); /* release extra ref taken above, bfqq may happen to be freed now */ bfq_put_queue(bfqq); } static void bfq_sync_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *sync_bfqq, struct bfq_io_cq *bic, struct bfq_group *bfqg, unsigned int act_idx) { struct bfq_queue *bfqq; if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { /* We are the only user of this bfqq, just move it */ if (sync_bfqq->entity.sched_data != &bfqg->sched_data) bfq_bfqq_move(bfqd, sync_bfqq, bfqg); return; } /* * The queue was merged to a different queue. Check * that the merge chain still belongs to the same * cgroup. */ for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) if (bfqq->entity.sched_data != &bfqg->sched_data) break; if (bfqq) { /* * Some queue changed cgroup so the merge is not valid * anymore. We cannot easily just cancel the merge (by * clearing new_bfqq) as there may be other processes * using this queue and holding refs to all queues * below sync_bfqq->new_bfqq. Similarly if the merge * already happened, we need to detach from bfqq now * so that we cannot merge bio to a request from the * old cgroup. */ bfq_put_cooperator(sync_bfqq); bic_set_bfqq(bic, NULL, true, act_idx); bfq_release_process_ref(bfqd, sync_bfqq); } } /** * __bfq_bic_change_cgroup - move @bic to @bfqg. * @bfqd: the queue descriptor. * @bic: the bic to move. * @bfqg: the group to move to. * * Move bic to blkcg, assuming that bfqd->lock is held; which makes * sure that the reference to cgroup is valid across the call (see * comments in bfq_bic_update_cgroup on this issue) */ static void __bfq_bic_change_cgroup(struct bfq_data *bfqd, struct bfq_io_cq *bic, struct bfq_group *bfqg) { unsigned int act_idx; for (act_idx = 0; act_idx < bfqd->num_actuators; act_idx++) { struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false, act_idx); struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true, act_idx); if (async_bfqq && async_bfqq->entity.sched_data != &bfqg->sched_data) { bic_set_bfqq(bic, NULL, false, act_idx); bfq_release_process_ref(bfqd, async_bfqq); } if (sync_bfqq) bfq_sync_bfqq_move(bfqd, sync_bfqq, bic, bfqg, act_idx); } } void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) { struct bfq_data *bfqd = bic_to_bfqd(bic); struct bfq_group *bfqg = bfq_bio_bfqg(bfqd, bio); uint64_t serial_nr; serial_nr = bfqg_to_blkg(bfqg)->blkcg->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger * spuriously on a newly created cic but there's no harm. */ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) return; /* * New cgroup for this process. Make sure it is linked to bfq internal * cgroup hierarchy. */ bfq_link_bfqg(bfqd, bfqg); __bfq_bic_change_cgroup(bfqd, bic, bfqg); bic->blkcg_serial_nr = serial_nr; } /** * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. * @st: the service tree being flushed. */ static void bfq_flush_idle_tree(struct bfq_service_tree *st) { struct bfq_entity *entity = st->first_idle; for (; entity ; entity = st->first_idle) __bfq_deactivate_entity(entity, false); } /** * bfq_reparent_leaf_entity - move leaf entity to the root_group. * @bfqd: the device data structure with the root group. * @entity: the entity to move, if entity is a leaf; or the parent entity * of an active leaf entity to move, if entity is not a leaf. * @ioprio_class: I/O priority class to reparent. */ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, struct bfq_entity *entity, int ioprio_class) { struct bfq_queue *bfqq; struct bfq_entity *child_entity = entity; while (child_entity->my_sched_data) { /* leaf not reached yet */ struct bfq_sched_data *child_sd = child_entity->my_sched_data; struct bfq_service_tree *child_st = child_sd->service_tree + ioprio_class; struct rb_root *child_active = &child_st->active; child_entity = bfq_entity_of(rb_first(child_active)); if (!child_entity) child_entity = child_sd->in_service_entity; } bfqq = bfq_entity_to_bfqq(child_entity); bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); } /** * bfq_reparent_active_queues - move to the root group all active queues. * @bfqd: the device data structure with the root group. * @bfqg: the group to move from. * @st: the service tree to start the search from. * @ioprio_class: I/O priority class to reparent. */ static void bfq_reparent_active_queues(struct bfq_data *bfqd, struct bfq_group *bfqg, struct bfq_service_tree *st, int ioprio_class) { struct rb_root *active = &st->active; struct bfq_entity *entity; while ((entity = bfq_entity_of(rb_first(active)))) bfq_reparent_leaf_entity(bfqd, entity, ioprio_class); if (bfqg->sched_data.in_service_entity) bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.in_service_entity, ioprio_class); } /** * bfq_pd_offline - deactivate the entity associated with @pd, * and reparent its children entities. * @pd: descriptor of the policy going offline. * * blkio already grabs the queue_lock for us, so no need to use * RCU-based magic */ static void bfq_pd_offline(struct blkg_policy_data *pd) { struct bfq_service_tree *st; struct bfq_group *bfqg = pd_to_bfqg(pd); struct bfq_data *bfqd = bfqg->bfqd; struct bfq_entity *entity = bfqg->my_entity; unsigned long flags; int i; spin_lock_irqsave(&bfqd->lock, flags); if (!entity) /* root group */ goto put_async_queues; /* * Empty all service_trees belonging to this group before * deactivating the group itself. */ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { st = bfqg->sched_data.service_tree + i; /* * It may happen that some queues are still active * (busy) upon group destruction (if the corresponding * processes have been forced to terminate). We move * all the leaf entities corresponding to these queues * to the root_group. * Also, it may happen that the group has an entity * in service, which is disconnected from the active * tree: it must be moved, too. * There is no need to put the sync queues, as the * scheduler has taken no reference. */ bfq_reparent_active_queues(bfqd, bfqg, st, i); /* * The idle tree may still contain bfq_queues * belonging to exited task because they never * migrated to a different cgroup from the one being * destroyed now. In addition, even * bfq_reparent_active_queues() may happen to add some * entities to the idle tree. It happens if, in some * of the calls to bfq_bfqq_move() performed by * bfq_reparent_active_queues(), the queue to move is * empty and gets expired. */ bfq_flush_idle_tree(st); } __bfq_deactivate_entity(entity, false); put_async_queues: bfq_put_async_queues(bfqd, bfqg); spin_unlock_irqrestore(&bfqd->lock, flags); /* * @blkg is going offline and will be ignored by * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so * that they don't get lost. If IOs complete after this point, the * stats for them will be lost. Oh well... */ bfqg_stats_xfer_dead(bfqg); } void bfq_end_wr_async(struct bfq_data *bfqd) { struct blkcg_gq *blkg; list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { struct bfq_group *bfqg = blkg_to_bfqg(blkg); bfq_end_wr_async_queues(bfqd, bfqg); } bfq_end_wr_async_queues(bfqd, bfqd->root_group); } static int bfq_io_show_weight_legacy(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); unsigned int val = 0; if (bfqgd) val = bfqgd->weight; seq_printf(sf, "%u\n", val); return 0; } static u64 bfqg_prfill_weight_device(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct bfq_group *bfqg = pd_to_bfqg(pd); if (!bfqg->entity.dev_weight) return 0; return __blkg_prfill_u64(sf, pd, bfqg->entity.dev_weight); } static int bfq_io_show_weight(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); seq_printf(sf, "default %u\n", bfqgd->weight); blkcg_print_blkgs(sf, blkcg, bfqg_prfill_weight_device, &blkcg_policy_bfq, 0, false); return 0; } static void bfq_group_set_weight(struct bfq_group *bfqg, u64 weight, u64 dev_weight) { weight = dev_weight ?: weight; bfqg->entity.dev_weight = dev_weight; /* * Setting the prio_changed flag of the entity * to 1 with new_weight == weight would re-set * the value of the weight to its ioprio mapping. * Set the flag only if necessary. */ if ((unsigned short)weight != bfqg->entity.new_weight) { bfqg->entity.new_weight = (unsigned short)weight; /* * Make sure that the above new value has been * stored in bfqg->entity.new_weight before * setting the prio_changed flag. In fact, * this flag may be read asynchronously (in * critical sections protected by a different * lock than that held here), and finding this * flag set may cause the execution of the code * for updating parameters whose value may * depend also on bfqg->entity.new_weight (in * __bfq_entity_update_weight_prio). * This barrier makes sure that the new value * of bfqg->entity.new_weight is correctly * seen in that code. */ smp_wmb(); bfqg->entity.prio_changed = 1; } } static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, struct cftype *cftype, u64 val) { struct blkcg *blkcg = css_to_blkcg(css); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); struct blkcg_gq *blkg; int ret = -ERANGE; if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) return ret; ret = 0; spin_lock_irq(&blkcg->lock); bfqgd->weight = (unsigned short)val; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct bfq_group *bfqg = blkg_to_bfqg(blkg); if (bfqg) bfq_group_set_weight(bfqg, val, 0); } spin_unlock_irq(&blkcg->lock); return ret; } static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { int ret; struct blkg_conf_ctx ctx; struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct bfq_group *bfqg; u64 v; blkg_conf_init(&ctx, buf); ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, &ctx); if (ret) goto out; if (sscanf(ctx.body, "%llu", &v) == 1) { /* require "default" on dfl */ ret = -ERANGE; if (!v) goto out; } else if (!strcmp(strim(ctx.body), "default")) { v = 0; } else { ret = -EINVAL; goto out; } bfqg = blkg_to_bfqg(ctx.blkg); ret = -ERANGE; if (!v || (v >= BFQ_MIN_WEIGHT && v <= BFQ_MAX_WEIGHT)) { bfq_group_set_weight(bfqg, bfqg->entity.weight, v); ret = 0; } out: blkg_conf_exit(&ctx); return ret ?: nbytes; } static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { char *endp; int ret; u64 v; buf = strim(buf); /* "WEIGHT" or "default WEIGHT" sets the default weight */ v = simple_strtoull(buf, &endp, 0); if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { ret = bfq_io_set_weight_legacy(of_css(of), NULL, v); return ret ?: nbytes; } return bfq_io_set_device_weight(of, buf, nbytes, off); } static int bfqg_print_rwstat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, &blkcg_policy_bfq, seq_cft(sf)->private, true); return 0; } static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct blkg_rwstat_sample sum; blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum); return __blkg_prfill_rwstat(sf, pd, &sum); } static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, seq_cft(sf)->private, true); return 0; } #ifdef CONFIG_BFQ_CGROUP_DEBUG static int bfqg_print_stat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, &blkcg_policy_bfq, seq_cft(sf)->private, false); return 0; } static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct blkcg_gq *blkg = pd_to_blkg(pd); struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; u64 sum = 0; lockdep_assert_held(&blkg->q->queue_lock); rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { struct bfq_stat *stat; if (!pos_blkg->online) continue; stat = (void *)blkg_to_pd(pos_blkg, &blkcg_policy_bfq) + off; sum += bfq_stat_read(stat) + atomic64_read(&stat->aux_cnt); } rcu_read_unlock(); return __blkg_prfill_u64(sf, pd, sum); } static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_stat_recursive, &blkcg_policy_bfq, seq_cft(sf)->private, false); return 0; } static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct bfq_group *bfqg = blkg_to_bfqg(pd->blkg); u64 sum = blkg_rwstat_total(&bfqg->stats.bytes); return __blkg_prfill_u64(sf, pd, sum >> 9); } static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); return 0; } static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct blkg_rwstat_sample tmp; blkg_rwstat_recursive_sum(pd->blkg, &blkcg_policy_bfq, offsetof(struct bfq_group, stats.bytes), &tmp); return __blkg_prfill_u64(sf, pd, (tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9); } static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, false); return 0; } static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct bfq_group *bfqg = pd_to_bfqg(pd); u64 samples = bfq_stat_read(&bfqg->stats.avg_queue_size_samples); u64 v = 0; if (samples) { v = bfq_stat_read(&bfqg->stats.avg_queue_size_sum); v = div64_u64(v, samples); } __blkg_prfill_u64(sf, pd, v); return 0; } /* print avg_queue_size */ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, 0, false); return 0; } #endif /* CONFIG_BFQ_CGROUP_DEBUG */ struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) { int ret; ret = blkcg_activate_policy(bfqd->queue->disk, &blkcg_policy_bfq); if (ret) return NULL; return blkg_to_bfqg(bfqd->queue->root_blkg); } struct blkcg_policy blkcg_policy_bfq = { .dfl_cftypes = bfq_blkg_files, .legacy_cftypes = bfq_blkcg_legacy_files, .cpd_alloc_fn = bfq_cpd_alloc, .cpd_free_fn = bfq_cpd_free, .pd_alloc_fn = bfq_pd_alloc, .pd_init_fn = bfq_pd_init, .pd_offline_fn = bfq_pd_offline, .pd_free_fn = bfq_pd_free, .pd_reset_stats_fn = bfq_pd_reset_stats, }; struct cftype bfq_blkcg_legacy_files[] = { { .name = "bfq.weight", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = bfq_io_show_weight_legacy, .write_u64 = bfq_io_set_weight_legacy, }, { .name = "bfq.weight_device", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = bfq_io_show_weight, .write = bfq_io_set_weight, }, /* statistics, covers only the tasks in the bfqg */ { .name = "bfq.io_service_bytes", .private = offsetof(struct bfq_group, stats.bytes), .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_serviced", .private = offsetof(struct bfq_group, stats.ios), .seq_show = bfqg_print_rwstat, }, #ifdef CONFIG_BFQ_CGROUP_DEBUG { .name = "bfq.time", .private = offsetof(struct bfq_group, stats.time), .seq_show = bfqg_print_stat, }, { .name = "bfq.sectors", .seq_show = bfqg_print_stat_sectors, }, { .name = "bfq.io_service_time", .private = offsetof(struct bfq_group, stats.service_time), .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_wait_time", .private = offsetof(struct bfq_group, stats.wait_time), .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_merged", .private = offsetof(struct bfq_group, stats.merged), .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_queued", .private = offsetof(struct bfq_group, stats.queued), .seq_show = bfqg_print_rwstat, }, #endif /* CONFIG_BFQ_CGROUP_DEBUG */ /* the same statistics which cover the bfqg and its descendants */ { .name = "bfq.io_service_bytes_recursive", .private = offsetof(struct bfq_group, stats.bytes), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_serviced_recursive", .private = offsetof(struct bfq_group, stats.ios), .seq_show = bfqg_print_rwstat_recursive, }, #ifdef CONFIG_BFQ_CGROUP_DEBUG { .name = "bfq.time_recursive", .private = offsetof(struct bfq_group, stats.time), .seq_show = bfqg_print_stat_recursive, }, { .name = "bfq.sectors_recursive", .seq_show = bfqg_print_stat_sectors_recursive, }, { .name = "bfq.io_service_time_recursive", .private = offsetof(struct bfq_group, stats.service_time), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_wait_time_recursive", .private = offsetof(struct bfq_group, stats.wait_time), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_merged_recursive", .private = offsetof(struct bfq_group, stats.merged), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_queued_recursive", .private = offsetof(struct bfq_group, stats.queued), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.avg_queue_size", .seq_show = bfqg_print_avg_queue_size, }, { .name = "bfq.group_wait_time", .private = offsetof(struct bfq_group, stats.group_wait_time), .seq_show = bfqg_print_stat, }, { .name = "bfq.idle_time", .private = offsetof(struct bfq_group, stats.idle_time), .seq_show = bfqg_print_stat, }, { .name = "bfq.empty_time", .private = offsetof(struct bfq_group, stats.empty_time), .seq_show = bfqg_print_stat, }, { .name = "bfq.dequeue", .private = offsetof(struct bfq_group, stats.dequeue), .seq_show = bfqg_print_stat, }, #endif /* CONFIG_BFQ_CGROUP_DEBUG */ { } /* terminate */ }; struct cftype bfq_blkg_files[] = { { .name = "bfq.weight", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = bfq_io_show_weight, .write = bfq_io_set_weight, }, {} /* terminate */ }; #else /* CONFIG_BFQ_GROUP_IOSCHED */ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg) {} void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->weight = entity->new_weight; entity->orig_weight = entity->new_weight; if (bfqq) { bfqq->ioprio = bfqq->new_ioprio; bfqq->ioprio_class = bfqq->new_ioprio_class; } entity->sched_data = &bfqg->sched_data; } void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} void bfq_end_wr_async(struct bfq_data *bfqd) { bfq_end_wr_async_queues(bfqd, bfqd->root_group); } struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) { return bfqd->root_group; } struct bfq_group *bfqq_group(struct bfq_queue *bfqq) { return bfqq->bfqd->root_group; } void bfqg_and_blkg_put(struct bfq_group *bfqg) {} struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) { struct bfq_group *bfqg; int i; bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); if (!bfqg) return NULL; for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; return bfqg; } #endif /* CONFIG_BFQ_GROUP_IOSCHED */
11 140 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Checksumming functions for IP, TCP, UDP and so on * * Authors: Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Borrows very liberally from tcp.c and ip.c, see those * files for more names. */ #ifndef _CHECKSUM_H #define _CHECKSUM_H #include <linux/errno.h> #include <asm/types.h> #include <asm/byteorder.h> #include <asm/checksum.h> #if !defined(_HAVE_ARCH_COPY_AND_CSUM_FROM_USER) || !defined(HAVE_CSUM_COPY_USER) #include <linux/uaccess.h> #endif #ifndef _HAVE_ARCH_COPY_AND_CSUM_FROM_USER static __always_inline __wsum csum_and_copy_from_user (const void __user *src, void *dst, int len) { if (copy_from_user(dst, src, len)) return 0; return csum_partial(dst, len, ~0U); } #endif #ifndef HAVE_CSUM_COPY_USER static __always_inline __wsum csum_and_copy_to_user (const void *src, void __user *dst, int len) { __wsum sum = csum_partial(src, len, ~0U); if (copy_to_user(dst, src, len) == 0) return sum; return 0; } #endif #ifndef _HAVE_ARCH_CSUM_AND_COPY static __always_inline __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len) { memcpy(dst, src, len); return csum_partial(dst, len, 0); } #endif #ifndef HAVE_ARCH_CSUM_ADD static __always_inline __wsum csum_add(__wsum csum, __wsum addend) { u32 res = (__force u32)csum; res += (__force u32)addend; return (__force __wsum)(res + (res < (__force u32)addend)); } #endif static __always_inline __wsum csum_sub(__wsum csum, __wsum addend) { return csum_add(csum, ~addend); } static __always_inline __sum16 csum16_add(__sum16 csum, __be16 addend) { u16 res = (__force u16)csum; res += (__force u16)addend; return (__force __sum16)(res + (res < (__force u16)addend)); } static __always_inline __sum16 csum16_sub(__sum16 csum, __be16 addend) { return csum16_add(csum, ~addend); } #ifndef HAVE_ARCH_CSUM_SHIFT static __always_inline __wsum csum_shift(__wsum sum, int offset) { /* rotate sum to align it with a 16b boundary */ if (offset & 1) return (__force __wsum)ror32((__force u32)sum, 8); return sum; } #endif static __always_inline __wsum csum_block_add(__wsum csum, __wsum csum2, int offset) { return csum_add(csum, csum_shift(csum2, offset)); } static __always_inline __wsum csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len) { return csum_block_add(csum, csum2, offset); } static __always_inline __wsum csum_block_sub(__wsum csum, __wsum csum2, int offset) { return csum_block_add(csum, ~csum2, offset); } static __always_inline __wsum csum_unfold(__sum16 n) { return (__force __wsum)n; } static __always_inline __wsum csum_partial_ext(const void *buff, int len, __wsum sum) { return csum_partial(buff, len, sum); } #define CSUM_MANGLED_0 ((__force __sum16)0xffff) static __always_inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) { *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); } static __always_inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) { __wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from); *sum = csum_fold(csum_add(tmp, (__force __wsum)to)); } /* Implements RFC 1624 (Incremental Internet Checksum) * 3. Discussion states : * HC' = ~(~HC + ~m + m') * m : old value of a 16bit field * m' : new value of a 16bit field */ static __always_inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) { *sum = ~csum16_add(csum16_sub(~(*sum), old), new); } static inline void csum_replace(__wsum *csum, __wsum old, __wsum new) { *csum = csum_add(csum_sub(*csum, old), new); } static inline unsigned short csum_from32to16(unsigned int sum) { sum += (sum >> 16) | (sum << 16); return (unsigned short)(sum >> 16); } struct sk_buff; void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr); void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, bool pseudohdr); void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, __wsum diff, bool pseudohdr); static __always_inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, __be16 from, __be16 to, bool pseudohdr) { inet_proto_csum_replace4(sum, skb, (__force __be32)from, (__force __be32)to, pseudohdr); } static __always_inline __wsum remcsum_adjust(void *ptr, __wsum csum, int start, int offset) { __sum16 *psum = (__sum16 *)(ptr + offset); __wsum delta; /* Subtract out checksum up to start */ csum = csum_sub(csum, csum_partial(ptr, start, 0)); /* Set derived checksum in packet */ delta = csum_sub((__force __wsum)csum_fold(csum), (__force __wsum)*psum); *psum = csum_fold(csum); return delta; } static __always_inline void remcsum_unadjust(__sum16 *psum, __wsum delta) { *psum = csum_fold(csum_sub(delta, (__force __wsum)*psum)); } static __always_inline __wsum wsum_negate(__wsum val) { return (__force __wsum)-((__force u32)val); } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 /* SPDX-License-Identifier: GPL-2.0 */ /* * fscrypt_private.h * * Copyright (C) 2015, Google, Inc. * * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. * Heavily modified since then. */ #ifndef _FSCRYPT_PRIVATE_H #define _FSCRYPT_PRIVATE_H #include <linux/fscrypt.h> #include <linux/siphash.h> #include <crypto/hash.h> #include <linux/blk-crypto.h> #define CONST_STRLEN(str) (sizeof(str) - 1) #define FSCRYPT_FILE_NONCE_SIZE 16 /* * Minimum size of an fscrypt master key. Note: a longer key will be required * if ciphers with a 256-bit security strength are used. This is just the * absolute minimum, which applies when only 128-bit encryption is used. */ #define FSCRYPT_MIN_KEY_SIZE 16 #define FSCRYPT_CONTEXT_V1 1 #define FSCRYPT_CONTEXT_V2 2 /* Keep this in sync with include/uapi/linux/fscrypt.h */ #define FSCRYPT_MODE_MAX FSCRYPT_MODE_AES_256_HCTR2 struct fscrypt_context_v1 { u8 version; /* FSCRYPT_CONTEXT_V1 */ u8 contents_encryption_mode; u8 filenames_encryption_mode; u8 flags; u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; struct fscrypt_context_v2 { u8 version; /* FSCRYPT_CONTEXT_V2 */ u8 contents_encryption_mode; u8 filenames_encryption_mode; u8 flags; u8 log2_data_unit_size; u8 __reserved[3]; u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; /* * fscrypt_context - the encryption context of an inode * * This is the on-disk equivalent of an fscrypt_policy, stored alongside each * encrypted file usually in a hidden extended attribute. It contains the * fields from the fscrypt_policy, in order to identify the encryption algorithm * and key with which the file is encrypted. It also contains a nonce that was * randomly generated by fscrypt itself; this is used as KDF input or as a tweak * to cause different files to be encrypted differently. */ union fscrypt_context { u8 version; struct fscrypt_context_v1 v1; struct fscrypt_context_v2 v2; }; /* * Return the size expected for the given fscrypt_context based on its version * number, or 0 if the context version is unrecognized. */ static inline int fscrypt_context_size(const union fscrypt_context *ctx) { switch (ctx->version) { case FSCRYPT_CONTEXT_V1: BUILD_BUG_ON(sizeof(ctx->v1) != 28); return sizeof(ctx->v1); case FSCRYPT_CONTEXT_V2: BUILD_BUG_ON(sizeof(ctx->v2) != 40); return sizeof(ctx->v2); } return 0; } /* Check whether an fscrypt_context has a recognized version number and size */ static inline bool fscrypt_context_is_valid(const union fscrypt_context *ctx, int ctx_size) { return ctx_size >= 1 && ctx_size == fscrypt_context_size(ctx); } /* Retrieve the context's nonce, assuming the context was already validated */ static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx) { switch (ctx->version) { case FSCRYPT_CONTEXT_V1: return ctx->v1.nonce; case FSCRYPT_CONTEXT_V2: return ctx->v2.nonce; } WARN_ON_ONCE(1); return NULL; } union fscrypt_policy { u8 version; struct fscrypt_policy_v1 v1; struct fscrypt_policy_v2 v2; }; /* * Return the size expected for the given fscrypt_policy based on its version * number, or 0 if the policy version is unrecognized. */ static inline int fscrypt_policy_size(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return sizeof(policy->v1); case FSCRYPT_POLICY_V2: return sizeof(policy->v2); } return 0; } /* Return the contents encryption mode of a valid encryption policy */ static inline u8 fscrypt_policy_contents_mode(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.contents_encryption_mode; case FSCRYPT_POLICY_V2: return policy->v2.contents_encryption_mode; } BUG(); } /* Return the filenames encryption mode of a valid encryption policy */ static inline u8 fscrypt_policy_fnames_mode(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.filenames_encryption_mode; case FSCRYPT_POLICY_V2: return policy->v2.filenames_encryption_mode; } BUG(); } /* Return the flags (FSCRYPT_POLICY_FLAG*) of a valid encryption policy */ static inline u8 fscrypt_policy_flags(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.flags; case FSCRYPT_POLICY_V2: return policy->v2.flags; } BUG(); } static inline int fscrypt_policy_v2_du_bits(const struct fscrypt_policy_v2 *policy, const struct inode *inode) { return policy->log2_data_unit_size ?: inode->i_blkbits; } static inline int fscrypt_policy_du_bits(const union fscrypt_policy *policy, const struct inode *inode) { switch (policy->version) { case FSCRYPT_POLICY_V1: return inode->i_blkbits; case FSCRYPT_POLICY_V2: return fscrypt_policy_v2_du_bits(&policy->v2, inode); } BUG(); } /* * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. */ struct fscrypt_symlink_data { __le16 len; char encrypted_path[]; } __packed; /** * struct fscrypt_prepared_key - a key prepared for actual encryption/decryption * @tfm: crypto API transform object * @blk_key: key for blk-crypto * * Normally only one of the fields will be non-NULL. */ struct fscrypt_prepared_key { struct crypto_skcipher *tfm; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT struct blk_crypto_key *blk_key; #endif }; /* * fscrypt_inode_info - the "encryption key" for an inode * * When an encrypted file's key is made available, an instance of this struct is * allocated and stored in ->i_crypt_info. Once created, it remains until the * inode is evicted. */ struct fscrypt_inode_info { /* The key in a form prepared for actual encryption/decryption */ struct fscrypt_prepared_key ci_enc_key; /* True if ci_enc_key should be freed when this struct is freed */ u8 ci_owns_key : 1; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT /* * True if this inode will use inline encryption (blk-crypto) instead of * the traditional filesystem-layer encryption. */ u8 ci_inlinecrypt : 1; #endif /* True if ci_dirhash_key is initialized */ u8 ci_dirhash_key_initialized : 1; /* * log2 of the data unit size (granularity of contents encryption) of * this file. This is computable from ci_policy and ci_inode but is * cached here for efficiency. Only used for regular files. */ u8 ci_data_unit_bits; /* Cached value: log2 of number of data units per FS block */ u8 ci_data_units_per_block_bits; /* Hashed inode number. Only set for IV_INO_LBLK_32 */ u32 ci_hashed_ino; /* * Encryption mode used for this inode. It corresponds to either the * contents or filenames encryption mode, depending on the inode type. */ struct fscrypt_mode *ci_mode; /* Back-pointer to the inode */ struct inode *ci_inode; /* * The master key with which this inode was unlocked (decrypted). This * will be NULL if the master key was found in a process-subscribed * keyring rather than in the filesystem-level keyring. */ struct fscrypt_master_key *ci_master_key; /* * Link in list of inodes that were unlocked with the master key. * Only used when ->ci_master_key is set. */ struct list_head ci_master_key_link; /* * If non-NULL, then encryption is done using the master key directly * and ci_enc_key will equal ci_direct_key->dk_key. */ struct fscrypt_direct_key *ci_direct_key; /* * This inode's hash key for filenames. This is a 128-bit SipHash-2-4 * key. This is only set for directories that use a keyed dirhash over * the plaintext filenames -- currently just casefolded directories. */ siphash_key_t ci_dirhash_key; /* The encryption policy used by this inode */ union fscrypt_policy ci_policy; /* This inode's nonce, copied from the fscrypt_context */ u8 ci_nonce[FSCRYPT_FILE_NONCE_SIZE]; }; typedef enum { FS_DECRYPT = 0, FS_ENCRYPT, } fscrypt_direction_t; /* crypto.c */ extern struct kmem_cache *fscrypt_inode_info_cachep; int fscrypt_initialize(struct super_block *sb); int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, fscrypt_direction_t rw, u64 index, struct page *src_page, struct page *dest_page, unsigned int len, unsigned int offs, gfp_t gfp_flags); struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); void __printf(3, 4) __cold fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...); #define fscrypt_warn(inode, fmt, ...) \ fscrypt_msg((inode), KERN_WARNING, fmt, ##__VA_ARGS__) #define fscrypt_err(inode, fmt, ...) \ fscrypt_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__) #define FSCRYPT_MAX_IV_SIZE 32 union fscrypt_iv { struct { /* zero-based index of data unit within the file */ __le64 index; /* per-file nonce; only set in DIRECT_KEY mode */ u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; u8 raw[FSCRYPT_MAX_IV_SIZE]; __le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)]; }; void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index, const struct fscrypt_inode_info *ci); /* * Return the number of bits used by the maximum file data unit index that is * possible on the given filesystem, using the given log2 data unit size. */ static inline int fscrypt_max_file_dun_bits(const struct super_block *sb, int du_bits) { return fls64(sb->s_maxbytes - 1) - du_bits; } /* fname.c */ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); /* hkdf.c */ struct fscrypt_hkdf { struct crypto_shash *hmac_tfm; }; int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, unsigned int master_key_size); /* * The list of contexts in which fscrypt uses HKDF. These values are used as * the first byte of the HKDF application-specific info string to guarantee that * info strings are never repeated between contexts. This ensures that all HKDF * outputs are unique and cryptographically isolated, i.e. knowledge of one * output doesn't reveal another. */ #define HKDF_CONTEXT_KEY_IDENTIFIER 1 /* info=<empty> */ #define HKDF_CONTEXT_PER_FILE_ENC_KEY 2 /* info=file_nonce */ #define HKDF_CONTEXT_DIRECT_KEY 3 /* info=mode_num */ #define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 /* info=mode_num||fs_uuid */ #define HKDF_CONTEXT_DIRHASH_KEY 5 /* info=file_nonce */ #define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 /* info=mode_num||fs_uuid */ #define HKDF_CONTEXT_INODE_HASH_KEY 7 /* info=<empty> */ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, u8 *okm, unsigned int okmlen); void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci); static inline bool fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci) { return ci->ci_inlinecrypt; } int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci); void fscrypt_destroy_inline_crypt_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); /* * Check whether the crypto transform or blk-crypto key has been allocated in * @prep_key, depending on which encryption implementation the file will use. */ static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, const struct fscrypt_inode_info *ci) { /* * The two smp_load_acquire()'s here pair with the smp_store_release()'s * in fscrypt_prepare_inline_crypt_key() and fscrypt_prepare_key(). * I.e., in some cases (namely, if this prep_key is a per-mode * encryption key) another task can publish blk_key or tfm concurrently, * executing a RELEASE barrier. We need to use smp_load_acquire() here * to safely ACQUIRE the memory the other task published. */ if (fscrypt_using_inline_encryption(ci)) return smp_load_acquire(&prep_key->blk_key) != NULL; return smp_load_acquire(&prep_key->tfm) != NULL; } #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci) { return 0; } static inline bool fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci) { return false; } static inline int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci) { WARN_ON_ONCE(1); return -EOPNOTSUPP; } static inline void fscrypt_destroy_inline_crypt_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key) { } static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, const struct fscrypt_inode_info *ci) { return smp_load_acquire(&prep_key->tfm) != NULL; } #endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ /* keyring.c */ /* * fscrypt_master_key_secret - secret key material of an in-use master key */ struct fscrypt_master_key_secret { /* * For v2 policy keys: HKDF context keyed by this master key. * For v1 policy keys: not set (hkdf.hmac_tfm == NULL). */ struct fscrypt_hkdf hkdf; /* * Size of the raw key in bytes. This remains set even if ->raw was * zeroized due to no longer being needed. I.e. we still remember the * size of the key even if we don't need to remember the key itself. */ u32 size; /* For v1 policy keys: the raw key. Wiped for v2 policy keys. */ u8 raw[FSCRYPT_MAX_KEY_SIZE]; } __randomize_layout; /* * fscrypt_master_key - an in-use master key * * This represents a master encryption key which has been added to the * filesystem. There are three high-level states that a key can be in: * * FSCRYPT_KEY_STATUS_PRESENT * Key is fully usable; it can be used to unlock inodes that are encrypted * with it (this includes being able to create new inodes). ->mk_present * indicates whether the key is in this state. ->mk_secret exists, the key * is in the keyring, and ->mk_active_refs > 0 due to ->mk_present. * * FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED * Removal of this key has been initiated, but some inodes that were * unlocked with it are still in-use. Like ABSENT, ->mk_secret is wiped, * and the key can no longer be used to unlock inodes. Unlike ABSENT, the * key is still in the keyring; ->mk_decrypted_inodes is nonempty; and * ->mk_active_refs > 0, being equal to the size of ->mk_decrypted_inodes. * * This state transitions to ABSENT if ->mk_decrypted_inodes becomes empty, * or to PRESENT if FS_IOC_ADD_ENCRYPTION_KEY is called again for this key. * * FSCRYPT_KEY_STATUS_ABSENT * Key is fully removed. The key is no longer in the keyring, * ->mk_decrypted_inodes is empty, ->mk_active_refs == 0, ->mk_secret is * wiped, and the key can no longer be used to unlock inodes. */ struct fscrypt_master_key { /* * Link in ->s_master_keys->key_hashtable. * Only valid if ->mk_active_refs > 0. */ struct hlist_node mk_node; /* Semaphore that protects ->mk_secret, ->mk_users, and ->mk_present */ struct rw_semaphore mk_sem; /* * Active and structural reference counts. An active ref guarantees * that the struct continues to exist, continues to be in the keyring * ->s_master_keys, and that any embedded subkeys (e.g. * ->mk_direct_keys) that have been prepared continue to exist. * A structural ref only guarantees that the struct continues to exist. * * There is one active ref associated with ->mk_present being true, and * one active ref for each inode in ->mk_decrypted_inodes. * * There is one structural ref associated with the active refcount being * nonzero. Finding a key in the keyring also takes a structural ref, * which is then held temporarily while the key is operated on. */ refcount_t mk_active_refs; refcount_t mk_struct_refs; struct rcu_head mk_rcu_head; /* * The secret key material. Wiped as soon as it is no longer needed; * for details, see the fscrypt_master_key struct comment. * * Locking: protected by ->mk_sem. */ struct fscrypt_master_key_secret mk_secret; /* * For v1 policy keys: an arbitrary key descriptor which was assigned by * userspace (->descriptor). * * For v2 policy keys: a cryptographic hash of this key (->identifier). */ struct fscrypt_key_specifier mk_spec; /* * Keyring which contains a key of type 'key_type_fscrypt_user' for each * user who has added this key. Normally each key will be added by just * one user, but it's possible that multiple users share a key, and in * that case we need to keep track of those users so that one user can't * remove the key before the others want it removed too. * * This is NULL for v1 policy keys; those can only be added by root. * * Locking: protected by ->mk_sem. (We don't just rely on the keyrings * subsystem semaphore ->mk_users->sem, as we need support for atomic * search+insert along with proper synchronization with other fields.) */ struct key *mk_users; /* * List of inodes that were unlocked using this key. This allows the * inodes to be evicted efficiently if the key is removed. */ struct list_head mk_decrypted_inodes; spinlock_t mk_decrypted_inodes_lock; /* * Per-mode encryption keys for the various types of encryption policies * that use them. Allocated and derived on-demand. */ struct fscrypt_prepared_key mk_direct_keys[FSCRYPT_MODE_MAX + 1]; struct fscrypt_prepared_key mk_iv_ino_lblk_64_keys[FSCRYPT_MODE_MAX + 1]; struct fscrypt_prepared_key mk_iv_ino_lblk_32_keys[FSCRYPT_MODE_MAX + 1]; /* Hash key for inode numbers. Initialized only when needed. */ siphash_key_t mk_ino_hash_key; bool mk_ino_hash_key_initialized; /* * Whether this key is in the "present" state, i.e. fully usable. For * details, see the fscrypt_master_key struct comment. * * Locking: protected by ->mk_sem, but can be read locklessly using * READ_ONCE(). Writers must use WRITE_ONCE() when concurrent readers * are possible. */ bool mk_present; } __randomize_layout; static inline const char *master_key_spec_type( const struct fscrypt_key_specifier *spec) { switch (spec->type) { case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: return "descriptor"; case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: return "identifier"; } return "[unknown]"; } static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) { switch (spec->type) { case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: return FSCRYPT_KEY_DESCRIPTOR_SIZE; case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: return FSCRYPT_KEY_IDENTIFIER_SIZE; } return 0; } void fscrypt_put_master_key(struct fscrypt_master_key *mk); void fscrypt_put_master_key_activeref(struct super_block *sb, struct fscrypt_master_key *mk); struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); int fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int fscrypt_add_test_dummy_key(struct super_block *sb, struct fscrypt_key_specifier *key_spec); int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int __init fscrypt_init_keyring(void); /* keysetup.c */ struct fscrypt_mode { const char *friendly_name; const char *cipher_str; int keysize; /* key size in bytes */ int security_strength; /* security strength in bytes */ int ivsize; /* IV size in bytes */ int logged_cryptoapi_impl; int logged_blk_crypto_native; int logged_blk_crypto_fallback; enum blk_crypto_mode_num blk_crypto_mode; }; extern struct fscrypt_mode fscrypt_modes[]; int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci); void fscrypt_destroy_prepared_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci, const u8 *raw_key); int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk); void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk); int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported); /** * fscrypt_require_key() - require an inode's encryption key * @inode: the inode we need the key for * * If the inode is encrypted, set up its encryption key if not already done. * Then require that the key be present and return -ENOKEY otherwise. * * No locks are needed, and the key will live as long as the struct inode --- so * it won't go away from under you. * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code * if a problem occurred while setting up the encryption key. */ static inline int fscrypt_require_key(struct inode *inode) { if (IS_ENCRYPTED(inode)) { int err = fscrypt_get_encryption_info(inode, false); if (err) return err; if (!fscrypt_has_encryption_key(inode)) return -ENOKEY; } return 0; } /* keysetup_v1.c */ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci, const u8 *raw_master_key); int fscrypt_setup_v1_file_key_via_subscribed_keyrings( struct fscrypt_inode_info *ci); /* policy.c */ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, const union fscrypt_policy *policy2); int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, struct fscrypt_key_specifier *key_spec); const union fscrypt_policy *fscrypt_get_dummy_policy(struct super_block *sb); bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, const struct inode *inode); int fscrypt_policy_from_context(union fscrypt_policy *policy_u, const union fscrypt_context *ctx_u, int ctx_size); const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir); #endif /* _FSCRYPT_PRIVATE_H */
8 152 163 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* AF_RXRPC internal definitions * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/atomic.h> #include <linux/seqlock.h> #include <linux/win_minmax.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <keys/rxrpc-type.h> #include "protocol.h" #define FCRYPT_BSIZE 8 struct rxrpc_crypt { union { u8 x[FCRYPT_BSIZE]; __be32 n[2]; }; } __attribute__((aligned(8))); #define rxrpc_queue_work(WS) queue_work(rxrpc_workqueue, (WS)) #define rxrpc_queue_delayed_work(WS,D) \ queue_delayed_work(rxrpc_workqueue, (WS), (D)) struct key_preparsed_payload; struct rxrpc_connection; struct rxrpc_txbuf; /* * Mark applied to socket buffers in skb->mark. skb->priority is used * to pass supplementary information. */ enum rxrpc_skb_mark { RXRPC_SKB_MARK_PACKET, /* Received packet */ RXRPC_SKB_MARK_ERROR, /* Error notification */ RXRPC_SKB_MARK_SERVICE_CONN_SECURED, /* Service connection response has been verified */ RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */ RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */ }; /* * sk_state for RxRPC sockets */ enum { RXRPC_UNBOUND = 0, RXRPC_CLIENT_UNBOUND, /* Unbound socket used as client */ RXRPC_CLIENT_BOUND, /* client local address bound */ RXRPC_SERVER_BOUND, /* server local address bound */ RXRPC_SERVER_BOUND2, /* second server local address bound */ RXRPC_SERVER_LISTENING, /* server listening for connections */ RXRPC_SERVER_LISTEN_DISABLED, /* server listening disabled */ RXRPC_CLOSE, /* socket is being closed */ }; /* * Per-network namespace data. */ struct rxrpc_net { struct proc_dir_entry *proc_net; /* Subdir in /proc/net */ u32 epoch; /* Local epoch for detecting local-end reset */ struct list_head calls; /* List of calls active in this namespace */ spinlock_t call_lock; /* Lock for ->calls */ atomic_t nr_calls; /* Count of allocated calls */ atomic_t nr_conns; struct list_head bundle_proc_list; /* List of bundles for proc */ struct list_head conn_proc_list; /* List of conns in this namespace for proc */ struct list_head service_conns; /* Service conns in this namespace */ rwlock_t conn_lock; /* Lock for ->conn_proc_list, ->service_conns */ struct work_struct service_conn_reaper; struct timer_list service_conn_reap_timer; bool live; atomic_t nr_client_conns; struct hlist_head local_endpoints; struct mutex local_mutex; /* Lock for ->local_endpoints */ DECLARE_HASHTABLE (peer_hash, 10); spinlock_t peer_hash_lock; /* Lock for ->peer_hash */ #define RXRPC_KEEPALIVE_TIME 20 /* NAT keepalive time in seconds */ u8 peer_keepalive_cursor; time64_t peer_keepalive_base; struct list_head peer_keepalive[32]; struct list_head peer_keepalive_new; struct timer_list peer_keepalive_timer; struct work_struct peer_keepalive_work; atomic_t stat_tx_data; atomic_t stat_tx_data_retrans; atomic_t stat_tx_data_send; atomic_t stat_tx_data_send_frag; atomic_t stat_tx_data_send_fail; atomic_t stat_tx_data_underflow; atomic_t stat_tx_data_cwnd_reset; atomic_t stat_rx_data; atomic_t stat_rx_data_reqack; atomic_t stat_rx_data_jumbo; atomic_t stat_tx_ack_fill; atomic_t stat_tx_ack_send; atomic_t stat_tx_ack_skip; atomic_t stat_tx_acks[256]; atomic_t stat_rx_acks[256]; atomic_t stat_why_req_ack[8]; atomic_t stat_io_loop; }; /* * Service backlog preallocation. * * This contains circular buffers of preallocated peers, connections and calls * for incoming service calls and their head and tail pointers. This allows * calls to be set up in the data_ready handler, thereby avoiding the need to * shuffle packets around so much. */ struct rxrpc_backlog { unsigned short peer_backlog_head; unsigned short peer_backlog_tail; unsigned short conn_backlog_head; unsigned short conn_backlog_tail; unsigned short call_backlog_head; unsigned short call_backlog_tail; #define RXRPC_BACKLOG_MAX 32 struct rxrpc_peer *peer_backlog[RXRPC_BACKLOG_MAX]; struct rxrpc_connection *conn_backlog[RXRPC_BACKLOG_MAX]; struct rxrpc_call *call_backlog[RXRPC_BACKLOG_MAX]; }; /* * RxRPC socket definition */ struct rxrpc_sock { /* WARNING: sk has to be the first member */ struct sock sk; rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */ rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */ struct rxrpc_local *local; /* local endpoint */ struct rxrpc_backlog *backlog; /* Preallocation for services */ spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */ struct list_head sock_calls; /* List of calls owned by this socket */ struct list_head to_be_accepted; /* calls awaiting acceptance */ struct list_head recvmsg_q; /* Calls awaiting recvmsg's attention */ spinlock_t recvmsg_lock; /* Lock for recvmsg_q */ struct key *key; /* security for this socket */ struct key *securities; /* list of server security descriptors */ struct rb_root calls; /* User ID -> call mapping */ unsigned long flags; #define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */ rwlock_t call_lock; /* lock for calls */ u32 min_sec_level; /* minimum security level */ #define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT bool exclusive; /* Exclusive connection for a client socket */ u16 second_service; /* Additional service bound to the endpoint */ struct { /* Service upgrade information */ u16 from; /* Service ID to upgrade (if not 0) */ u16 to; /* service ID to upgrade to */ } service_upgrade; sa_family_t family; /* Protocol family created with */ struct sockaddr_rxrpc srx; /* Primary Service/local addresses */ struct sockaddr_rxrpc connect_srx; /* Default client address from connect() */ }; #define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk) /* * CPU-byteorder normalised Rx packet header. */ struct rxrpc_host_header { u32 epoch; /* client boot timestamp */ u32 cid; /* connection and channel ID */ u32 callNumber; /* call ID (0 for connection-level packets) */ u32 seq; /* sequence number of pkt in call stream */ u32 serial; /* serial number of pkt sent to network */ u8 type; /* packet type */ u8 flags; /* packet flags */ u8 userStatus; /* app-layer defined status */ u8 securityIndex; /* security protocol ID */ union { u16 _rsvd; /* reserved */ u16 cksum; /* kerberos security checksum */ }; u16 serviceId; /* service ID */ } __packed; /* * RxRPC socket buffer private variables * - max 48 bytes (struct sk_buff::cb) */ struct rxrpc_skb_priv { union { struct rxrpc_connection *conn; /* Connection referred to (poke packet) */ struct { u16 offset; /* Offset of data */ u16 len; /* Length of data */ u8 flags; #define RXRPC_RX_VERIFIED 0x01 }; struct { rxrpc_seq_t first_ack; /* First packet in acks table */ rxrpc_seq_t prev_ack; /* Highest seq seen */ rxrpc_serial_t acked_serial; /* Packet in response to (or 0) */ u8 reason; /* Reason for ack */ u8 nr_acks; /* Number of acks+nacks */ u8 nr_nacks; /* Number of nacks */ } ack; }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ }; #define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb) /* * RxRPC security module interface */ struct rxrpc_security { const char *name; /* name of this service */ u8 security_index; /* security type provided */ u32 no_key_abort; /* Abort code indicating no key */ /* Initialise a security service */ int (*init)(void); /* Clean up a security service */ void (*exit)(void); /* Parse the information from a server key */ int (*preparse_server_key)(struct key_preparsed_payload *); /* Clean up the preparse buffer after parsing a server key */ void (*free_preparse_server_key)(struct key_preparsed_payload *); /* Destroy the payload of a server key */ void (*destroy_server_key)(struct key *); /* Describe a server key */ void (*describe_server_key)(const struct key *, struct seq_file *); /* initialise a connection's security */ int (*init_connection_security)(struct rxrpc_connection *, struct rxrpc_key_token *); /* Work out how much data we can store in a packet, given an estimate * of the amount of data remaining and allocate a data buffer. */ struct rxrpc_txbuf *(*alloc_txbuf)(struct rxrpc_call *call, size_t remaining, gfp_t gfp); /* impose security on a packet */ int (*secure_packet)(struct rxrpc_call *, struct rxrpc_txbuf *); /* verify the security on a received packet */ int (*verify_packet)(struct rxrpc_call *, struct sk_buff *); /* Free crypto request on a call */ void (*free_call_crypto)(struct rxrpc_call *); /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); /* respond to a challenge */ int (*respond_to_challenge)(struct rxrpc_connection *, struct sk_buff *); /* verify a response */ int (*verify_response)(struct rxrpc_connection *, struct sk_buff *); /* clear connection security */ void (*clear)(struct rxrpc_connection *); }; /* * RxRPC local transport endpoint description * - owned by a single AF_RXRPC socket * - pointed to by transport socket struct sk_user_data */ struct rxrpc_local { struct rcu_head rcu; atomic_t active_users; /* Number of users of the local endpoint */ refcount_t ref; /* Number of references to the structure */ struct net *net; /* The network namespace */ struct rxrpc_net *rxnet; /* Our bits in the network namespace */ struct hlist_node link; struct socket *socket; /* my UDP socket */ struct task_struct *io_thread; struct completion io_thread_ready; /* Indication that the I/O thread started */ struct page_frag_cache tx_alloc; /* Tx control packet allocation (I/O thread only) */ struct rxrpc_sock *service; /* Service(s) listening on this endpoint */ #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY struct sk_buff_head rx_delay_queue; /* Delay injection queue */ #endif struct sk_buff_head rx_queue; /* Received packets */ struct list_head conn_attend_q; /* Conns requiring immediate attention */ struct list_head call_attend_q; /* Calls requiring immediate attention */ struct rb_root client_bundles; /* Client connection bundles by socket params */ spinlock_t client_bundles_lock; /* Lock for client_bundles */ bool kill_all_client_conns; struct list_head idle_client_conns; struct timer_list client_conn_reap_timer; unsigned long client_conn_flags; #define RXRPC_CLIENT_CONN_REAP_TIMER 0 /* The client conn reap timer expired */ spinlock_t lock; /* access lock */ rwlock_t services_lock; /* lock for services list */ int debug_id; /* debug ID for printks */ bool dead; bool service_closed; /* Service socket closed */ struct idr conn_ids; /* List of connection IDs */ struct list_head new_client_calls; /* Newly created client calls need connection */ spinlock_t client_call_lock; /* Lock for ->new_client_calls */ struct sockaddr_rxrpc srx; /* local address */ }; /* * RxRPC remote transport endpoint definition * - matched by local endpoint, remote port, address and protocol type */ struct rxrpc_peer { struct rcu_head rcu; /* This must be first */ refcount_t ref; unsigned long hash_key; struct hlist_node hash_link; struct rxrpc_local *local; struct hlist_head error_targets; /* targets for net error distribution */ struct rb_root service_conns; /* Service connections */ struct list_head keepalive_link; /* Link in net->peer_keepalive[] */ time64_t last_tx_at; /* Last time packet sent here */ seqlock_t service_conn_lock; spinlock_t lock; /* access lock */ unsigned int if_mtu; /* interface MTU for this peer */ unsigned int mtu; /* network MTU for this peer */ unsigned int maxdata; /* data size (MTU - hdrsize) */ unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ int debug_id; /* debug ID for printks */ struct sockaddr_rxrpc srx; /* remote address */ /* calculated RTT cache */ #define RXRPC_RTT_CACHE_SIZE 32 spinlock_t rtt_input_lock; /* RTT lock for input routine */ ktime_t rtt_last_req; /* Time of last RTT request */ unsigned int rtt_count; /* Number of samples we've got */ u32 srtt_us; /* smoothed round trip time << 3 in usecs */ u32 mdev_us; /* medium deviation */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ u32 rttvar_us; /* smoothed mdev_max */ u32 rto_us; /* Retransmission timeout in usec */ u8 backoff; /* Backoff timeout (as shift) */ u8 cong_ssthresh; /* Congestion slow-start threshold */ }; /* * Keys for matching a connection. */ struct rxrpc_conn_proto { union { struct { u32 epoch; /* epoch of this connection */ u32 cid; /* connection ID */ }; u64 index_key; }; }; struct rxrpc_conn_parameters { struct rxrpc_local *local; /* Representation of local endpoint */ struct rxrpc_peer *peer; /* Representation of remote endpoint */ struct key *key; /* Security details */ bool exclusive; /* T if conn is exclusive */ bool upgrade; /* T if service ID can be upgraded */ u16 service_id; /* Service ID for this connection */ u32 security_level; /* Security level selected */ }; /* * Call completion condition (state == RXRPC_CALL_COMPLETE). */ enum rxrpc_call_completion { RXRPC_CALL_SUCCEEDED, /* - Normal termination */ RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */ RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ NR__RXRPC_CALL_COMPLETIONS }; /* * Bits in the connection flags. */ enum rxrpc_conn_flag { RXRPC_CONN_IN_SERVICE_CONNS, /* Conn is in peer->service_conns */ RXRPC_CONN_DONT_REUSE, /* Don't reuse this connection */ RXRPC_CONN_PROBING_FOR_UPGRADE, /* Probing for service upgrade */ RXRPC_CONN_FINAL_ACK_0, /* Need final ACK for channel 0 */ RXRPC_CONN_FINAL_ACK_1, /* Need final ACK for channel 1 */ RXRPC_CONN_FINAL_ACK_2, /* Need final ACK for channel 2 */ RXRPC_CONN_FINAL_ACK_3, /* Need final ACK for channel 3 */ }; #define RXRPC_CONN_FINAL_ACK_MASK ((1UL << RXRPC_CONN_FINAL_ACK_0) | \ (1UL << RXRPC_CONN_FINAL_ACK_1) | \ (1UL << RXRPC_CONN_FINAL_ACK_2) | \ (1UL << RXRPC_CONN_FINAL_ACK_3)) /* * Events that can be raised upon a connection. */ enum rxrpc_conn_event { RXRPC_CONN_EV_CHALLENGE, /* Send challenge packet */ RXRPC_CONN_EV_ABORT_CALLS, /* Abort attached calls */ }; /* * The connection protocol state. */ enum rxrpc_conn_proto_state { RXRPC_CONN_UNUSED, /* Connection not yet attempted */ RXRPC_CONN_CLIENT_UNSECURED, /* Client connection needs security init */ RXRPC_CONN_CLIENT, /* Client connection */ RXRPC_CONN_SERVICE_PREALLOC, /* Service connection preallocation */ RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */ RXRPC_CONN_SERVICE_CHALLENGING, /* Service challenging for security */ RXRPC_CONN_SERVICE, /* Service secured connection */ RXRPC_CONN_ABORTED, /* Conn aborted */ RXRPC_CONN__NR_STATES }; /* * RxRPC client connection bundle. */ struct rxrpc_bundle { struct rxrpc_local *local; /* Representation of local endpoint */ struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* Security details */ struct list_head proc_link; /* Link in net->bundle_proc_list */ const struct rxrpc_security *security; /* applied security module */ refcount_t ref; atomic_t active; /* Number of active users */ unsigned int debug_id; u32 security_level; /* Security level selected */ u16 service_id; /* Service ID for this connection */ bool try_upgrade; /* True if the bundle is attempting upgrade */ bool exclusive; /* T if conn is exclusive */ bool upgrade; /* T if service ID can be upgraded */ unsigned short alloc_error; /* Error from last conn allocation */ struct rb_node local_node; /* Node in local->client_conns */ struct list_head waiting_calls; /* Calls waiting for channels */ unsigned long avail_chans; /* Mask of available channels */ unsigned int conn_ids[4]; /* Connection IDs. */ struct rxrpc_connection *conns[4]; /* The connections in the bundle (max 4) */ }; /* * RxRPC connection definition * - matched by { local, peer, epoch, conn_id, direction } * - each connection can only handle four simultaneous calls */ struct rxrpc_connection { struct rxrpc_conn_proto proto; struct rxrpc_local *local; /* Representation of local endpoint */ struct rxrpc_peer *peer; /* Remote endpoint */ struct rxrpc_net *rxnet; /* Network namespace to which call belongs */ struct key *key; /* Security details */ struct list_head attend_link; /* Link in local->conn_attend_q */ refcount_t ref; atomic_t active; /* Active count for service conns */ struct rcu_head rcu; struct list_head cache_link; unsigned char act_chans; /* Mask of active channels */ struct rxrpc_channel { unsigned long final_ack_at; /* Time at which to issue final ACK */ struct rxrpc_call *call; /* Active call */ unsigned int call_debug_id; /* call->debug_id */ u32 call_id; /* ID of current call */ u32 call_counter; /* Call ID counter */ u32 last_call; /* ID of last call */ u8 last_type; /* Type of last packet */ union { u32 last_seq; u32 last_abort; }; } channels[RXRPC_MAXCALLS]; struct timer_list timer; /* Conn event timer */ struct work_struct processor; /* connection event processor */ struct work_struct destructor; /* In-process-context destroyer */ struct rxrpc_bundle *bundle; /* Client connection bundle */ struct rb_node service_node; /* Node in peer->service_conns */ struct list_head proc_link; /* link in procfs list */ struct list_head link; /* link in master connection list */ struct sk_buff_head rx_queue; /* received conn-level packets */ struct page_frag_cache tx_data_alloc; /* Tx DATA packet allocation */ struct mutex tx_data_alloc_lock; struct mutex security_lock; /* Lock for security management */ const struct rxrpc_security *security; /* applied security module */ union { struct { struct crypto_sync_skcipher *cipher; /* encryption handle */ struct rxrpc_crypt csum_iv; /* packet checksum base */ u32 nonce; /* response re-use preventer */ } rxkad; }; unsigned long flags; unsigned long events; unsigned long idle_timestamp; /* Time at which last became idle */ spinlock_t state_lock; /* state-change lock */ enum rxrpc_conn_proto_state state; /* current state of connection */ enum rxrpc_call_completion completion; /* Completion condition */ s32 abort_code; /* Abort code of connection abort */ int debug_id; /* debug ID for printks */ rxrpc_serial_t tx_serial; /* Outgoing packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ u32 service_id; /* Service ID, possibly upgraded */ u32 security_level; /* Security level selected */ u8 security_ix; /* security type */ u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ u8 bundle_shift; /* Index into bundle->avail_chans */ bool exclusive; /* T if conn is exclusive */ bool upgrade; /* T if service ID can be upgraded */ u16 orig_service_id; /* Originally requested service ID */ short error; /* Local error code */ }; static inline bool rxrpc_to_server(const struct rxrpc_skb_priv *sp) { return sp->hdr.flags & RXRPC_CLIENT_INITIATED; } static inline bool rxrpc_to_client(const struct rxrpc_skb_priv *sp) { return !rxrpc_to_server(sp); } /* * Flags in call->flags. */ enum rxrpc_call_flag { RXRPC_CALL_RELEASED, /* call has been released - no more message to userspace */ RXRPC_CALL_HAS_USERID, /* has a user ID attached */ RXRPC_CALL_IS_SERVICE, /* Call is service call */ RXRPC_CALL_EXPOSED, /* The call was exposed to the world */ RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ RXRPC_CALL_TX_ALL_ACKED, /* Last packet has been hard-acked */ RXRPC_CALL_SEND_PING, /* A ping will need to be sent */ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */ RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */ RXRPC_CALL_KERNEL, /* The call was made by the kernel */ RXRPC_CALL_UPGRADE, /* Service upgrade was requested for the call */ RXRPC_CALL_EXCLUSIVE, /* The call uses a once-only connection */ RXRPC_CALL_RX_IS_IDLE, /* recvmsg() is idle - send an ACK */ RXRPC_CALL_RECVMSG_READ_ALL, /* recvmsg() read all of the received data */ }; /* * Events that can be raised on a call. */ enum rxrpc_call_event { RXRPC_CALL_EV_ACK_LOST, /* ACK may be lost, send ping */ RXRPC_CALL_EV_INITIAL_PING, /* Send initial ping for a new service call */ }; /* * The states that a call can be in. */ enum rxrpc_call_state { RXRPC_CALL_UNINITIALISED, RXRPC_CALL_CLIENT_AWAIT_CONN, /* - client waiting for connection to become available */ RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */ RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */ RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */ RXRPC_CALL_COMPLETE, /* - call complete */ NR__RXRPC_CALL_STATES }; /* * Call Tx congestion management modes. */ enum rxrpc_congest_mode { RXRPC_CALL_SLOW_START, RXRPC_CALL_CONGEST_AVOIDANCE, RXRPC_CALL_PACKET_LOSS, RXRPC_CALL_FAST_RETRANSMIT, NR__RXRPC_CONGEST_MODES }; /* * RxRPC call definition * - matched by { connection, call_id } */ struct rxrpc_call { struct rcu_head rcu; struct rxrpc_connection *conn; /* connection carrying call */ struct rxrpc_bundle *bundle; /* Connection bundle to use */ struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_local *local; /* Representation of local endpoint */ struct rxrpc_sock __rcu *socket; /* socket responsible */ struct rxrpc_net *rxnet; /* Network namespace to which call belongs */ struct key *key; /* Security details */ const struct rxrpc_security *security; /* applied security module */ struct mutex user_mutex; /* User access mutex */ struct sockaddr_rxrpc dest_srx; /* Destination address */ ktime_t delay_ack_at; /* When DELAY ACK needs to happen */ ktime_t ack_lost_at; /* When ACK is figured as lost */ ktime_t resend_at; /* When next resend needs to happen */ ktime_t ping_at; /* When next to send a ping */ ktime_t keepalive_at; /* When next to send a keepalive ping */ ktime_t expect_rx_by; /* When we expect to get a packet by */ ktime_t expect_req_by; /* When we expect to get a request DATA packet by */ ktime_t expect_term_by; /* When we expect call termination by */ u32 next_rx_timo; /* Timeout for next Rx packet (ms) */ u32 next_req_timo; /* Timeout for next Rx request packet (ms) */ u32 hard_timo; /* Maximum lifetime or 0 (s) */ struct timer_list timer; /* Combined event timer */ struct work_struct destroyer; /* In-process-context destroyer */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ struct list_head link; /* link in master call list */ struct list_head wait_link; /* Link in local->new_client_calls */ struct hlist_node error_link; /* link in error distribution list */ struct list_head accept_link; /* Link in rx->acceptq */ struct list_head recvmsg_link; /* Link in rx->recvmsg_q */ struct list_head sock_link; /* Link in rx->sock_calls */ struct rb_node sock_node; /* Node in rx->calls */ struct list_head attend_link; /* Link in local->call_attend_q */ struct rxrpc_txbuf *tx_pending; /* Tx buffer being filled */ wait_queue_head_t waitq; /* Wait queue for channel or Tx */ s64 tx_total_len; /* Total length left to be transmitted (or -1) */ unsigned long user_call_ID; /* user-defined call ID */ unsigned long flags; unsigned long events; spinlock_t notify_lock; /* Kernel notification lock */ unsigned int send_abort_why; /* Why the abort [enum rxrpc_abort_reason] */ s32 send_abort; /* Abort code to be sent */ short send_abort_err; /* Error to be associated with the abort */ rxrpc_seq_t send_abort_seq; /* DATA packet that incurred the abort (or 0) */ s32 abort_code; /* Local/remote abort code */ int error; /* Local error incurred */ enum rxrpc_call_state _state; /* Current state of call (needs barrier) */ enum rxrpc_call_completion completion; /* Call completion condition */ refcount_t ref; u8 security_ix; /* Security type */ enum rxrpc_interruptibility interruptibility; /* At what point call may be interrupted */ u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ u32 security_level; /* Security level selected */ int debug_id; /* debug ID for printks */ unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ unsigned short rx_pkt_len; /* Current recvmsg packet len */ /* Transmitted data tracking. */ spinlock_t tx_lock; /* Transmit queue lock */ struct list_head tx_sendmsg; /* Sendmsg prepared packets */ struct list_head tx_buffer; /* Buffer of transmissible packets */ rxrpc_seq_t tx_bottom; /* First packet in buffer */ rxrpc_seq_t tx_transmitted; /* Highest packet transmitted */ rxrpc_seq_t tx_prepared; /* Highest Tx slot prepared. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ u16 tx_backoff; /* Delay to insert due to Tx failure (ms) */ u8 tx_winsize; /* Maximum size of Tx window */ #define RXRPC_TX_MAX_WINDOW 128 ktime_t tx_last_sent; /* Last time a transmission occurred */ /* Received data tracking */ struct sk_buff_head recvmsg_queue; /* Queue of packets ready for recvmsg() */ struct sk_buff_head rx_oos_queue; /* Queue of out of sequence packets */ rxrpc_seq_t rx_highest_seq; /* Higest sequence number received */ rxrpc_seq_t rx_consumed; /* Highest packet consumed */ rxrpc_serial_t rx_serial; /* Highest serial received for this call */ u8 rx_winsize; /* Size of Rx window */ /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS * is fixed, we keep these numbers in terms of segments (ie. DATA * packets) rather than bytes. */ #define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN #define RXRPC_MIN_CWND 4 u8 cong_cwnd; /* Congestion window size */ u8 cong_extra; /* Extra to send for congestion management */ u8 cong_ssthresh; /* Slow-start threshold */ enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */ u8 cong_dup_acks; /* Count of ACKs showing missing packets */ u8 cong_cumul_acks; /* Cumulative ACK count */ ktime_t cong_tstamp; /* Last time cwnd was changed */ struct sk_buff *cong_last_nack; /* Last ACK with nacks received */ /* Receive-phase ACK management (ACKs we send). */ u8 ackr_reason; /* reason to ACK */ u16 ackr_sack_base; /* Starting slot in SACK table ring */ rxrpc_seq_t ackr_window; /* Base of SACK window */ rxrpc_seq_t ackr_wtop; /* Base of SACK window */ unsigned int ackr_nr_unacked; /* Number of unacked packets */ atomic_t ackr_nr_consumed; /* Number of packets needing hard ACK */ struct { #define RXRPC_SACK_SIZE 256 /* SACK table for soft-acked packets */ u8 ackr_sack_table[RXRPC_SACK_SIZE]; } __aligned(8); /* RTT management */ rxrpc_serial_t rtt_serial[4]; /* Serial number of DATA or PING sent */ ktime_t rtt_sent_at[4]; /* Time packet sent */ unsigned long rtt_avail; /* Mask of available slots in bits 0-3, * Mask of pending samples in 8-11 */ #define RXRPC_CALL_RTT_AVAIL_MASK 0xf #define RXRPC_CALL_RTT_PEND_SHIFT 8 /* Transmission-phase ACK management (ACKs we've received). */ ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ rxrpc_seq_t acks_first_seq; /* first sequence number received */ rxrpc_seq_t acks_prev_seq; /* Highest previousPacket received */ rxrpc_seq_t acks_hard_ack; /* Latest hard-ack point */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ rxrpc_serial_t acks_highest_serial; /* Highest serial number ACK'd */ }; /* * Summary of a new ACK and the changes it made to the Tx buffer packet states. */ struct rxrpc_ack_summary { u16 nr_acks; /* Number of ACKs in packet */ u16 nr_new_acks; /* Number of new ACKs in packet */ u16 nr_new_nacks; /* Number of new nacks in packet */ u16 nr_retained_nacks; /* Number of nacks retained between ACKs */ u8 ack_reason; bool saw_nacks; /* Saw NACKs in packet */ bool new_low_nack; /* T if new low NACK found */ bool retrans_timeo; /* T if reTx due to timeout happened */ u8 flight_size; /* Number of unreceived transmissions */ /* Place to stash values for tracing */ enum rxrpc_congest_mode mode:8; u8 cwnd; u8 ssthresh; u8 dup_acks; u8 cumulative_acks; }; /* * sendmsg() cmsg-specified parameters. */ enum rxrpc_command { RXRPC_CMD_SEND_DATA, /* send data message */ RXRPC_CMD_SEND_ABORT, /* request abort generation */ RXRPC_CMD_REJECT_BUSY, /* [server] reject a call as busy */ RXRPC_CMD_CHARGE_ACCEPT, /* [server] charge accept preallocation */ }; struct rxrpc_call_params { s64 tx_total_len; /* Total Tx data length (if send data) */ unsigned long user_call_ID; /* User's call ID */ struct { u32 hard; /* Maximum lifetime (sec) */ u32 idle; /* Max time since last data packet (msec) */ u32 normal; /* Max time since last call packet (msec) */ } timeouts; u8 nr_timeouts; /* Number of timeouts specified */ bool kernel; /* T if kernel is making the call */ enum rxrpc_interruptibility interruptibility; /* How is interruptible is the call? */ }; struct rxrpc_send_params { struct rxrpc_call_params call; u32 abort_code; /* Abort code to Tx (if abort) */ enum rxrpc_command command : 8; /* The command to implement */ bool exclusive; /* Shared or exclusive call */ bool upgrade; /* If the connection is upgradeable */ }; /* * Buffer of data to be output as a packet. */ struct rxrpc_txbuf { struct list_head call_link; /* Link in call->tx_sendmsg/tx_buffer */ struct list_head tx_link; /* Link in live Enc queue or Tx queue */ ktime_t last_sent; /* Time at which last transmitted */ refcount_t ref; rxrpc_seq_t seq; /* Sequence number of this packet */ rxrpc_serial_t serial; /* Last serial number transmitted with */ unsigned int call_debug_id; unsigned int debug_id; unsigned int len; /* Amount of data in buffer */ unsigned int space; /* Remaining data space */ unsigned int offset; /* Offset of fill point */ unsigned int flags; #define RXRPC_TXBUF_WIRE_FLAGS 0xff /* The wire protocol flags */ #define RXRPC_TXBUF_RESENT 0x100 /* Set if has been resent */ __be16 cksum; /* Checksum to go in header */ unsigned short ack_rwind; /* ACK receive window */ u8 /*enum rxrpc_propose_ack_trace*/ ack_why; /* If ack, why */ u8 nr_kvec; /* Amount of kvec[] used */ struct kvec kvec[3]; }; static inline bool rxrpc_sending_to_server(const struct rxrpc_txbuf *txb) { return txb->flags & RXRPC_CLIENT_INITIATED; } static inline bool rxrpc_sending_to_client(const struct rxrpc_txbuf *txb) { return !rxrpc_sending_to_server(txb); } #include <trace/events/rxrpc.h> /* * Allocate the next serial number on a connection. 0 must be skipped. */ static inline rxrpc_serial_t rxrpc_get_next_serial(struct rxrpc_connection *conn) { rxrpc_serial_t serial; serial = conn->tx_serial; if (serial == 0) serial = 1; conn->tx_serial = serial + 1; return serial; } /* * af_rxrpc.c */ extern atomic_t rxrpc_n_rx_skbs; extern struct workqueue_struct *rxrpc_workqueue; /* * call_accept.c */ int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); void rxrpc_discard_prealloc(struct rxrpc_sock *); bool rxrpc_new_incoming_call(struct rxrpc_local *local, struct rxrpc_peer *peer, struct rxrpc_connection *conn, struct sockaddr_rxrpc *peer_srx, struct sk_buff *skb); int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long); /* * call_event.c */ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, enum rxrpc_propose_ack_trace why); void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t, enum rxrpc_propose_ack_trace); void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *); void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb); bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb); /* * call_object.c */ extern const char *const rxrpc_call_states[]; extern const char *const rxrpc_call_completions[]; extern struct kmem_cache *rxrpc_call_jar; void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what); struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct rxrpc_call_params *, gfp_t, unsigned int); void rxrpc_start_call_timer(struct rxrpc_call *call); void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, struct sk_buff *); void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); void rxrpc_see_call(struct rxrpc_call *, enum rxrpc_call_trace); struct rxrpc_call *rxrpc_try_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_cleanup_call(struct rxrpc_call *); void rxrpc_destroy_all_calls(struct rxrpc_net *); static inline bool rxrpc_is_service_call(const struct rxrpc_call *call) { return test_bit(RXRPC_CALL_IS_SERVICE, &call->flags); } static inline bool rxrpc_is_client_call(const struct rxrpc_call *call) { return !rxrpc_is_service_call(call); } /* * call_state.c */ bool rxrpc_set_call_completion(struct rxrpc_call *call, enum rxrpc_call_completion compl, u32 abort_code, int error); bool rxrpc_call_completed(struct rxrpc_call *call); bool rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, u32 abort_code, int error, enum rxrpc_abort_reason why); void rxrpc_prefail_call(struct rxrpc_call *call, enum rxrpc_call_completion compl, int error); static inline void rxrpc_set_call_state(struct rxrpc_call *call, enum rxrpc_call_state state) { /* Order write of completion info before write of ->state. */ smp_store_release(&call->_state, state); wake_up(&call->waitq); } static inline enum rxrpc_call_state __rxrpc_call_state(const struct rxrpc_call *call) { return call->_state; /* Only inside I/O thread */ } static inline bool __rxrpc_call_is_complete(const struct rxrpc_call *call) { return __rxrpc_call_state(call) == RXRPC_CALL_COMPLETE; } static inline enum rxrpc_call_state rxrpc_call_state(const struct rxrpc_call *call) { /* Order read ->state before read of completion info. */ return smp_load_acquire(&call->_state); } static inline bool rxrpc_call_is_complete(const struct rxrpc_call *call) { return rxrpc_call_state(call) == RXRPC_CALL_COMPLETE; } static inline bool rxrpc_call_has_failed(const struct rxrpc_call *call) { return rxrpc_call_is_complete(call) && call->completion != RXRPC_CALL_SUCCEEDED; } /* * conn_client.c */ extern unsigned int rxrpc_reap_client_connections; extern unsigned long rxrpc_conn_idle_client_expiry; extern unsigned long rxrpc_conn_idle_client_fast_expiry; void rxrpc_purge_client_connections(struct rxrpc_local *local); struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *, enum rxrpc_bundle_trace); void rxrpc_put_bundle(struct rxrpc_bundle *, enum rxrpc_bundle_trace); int rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t gfp); void rxrpc_connect_client_calls(struct rxrpc_local *local); void rxrpc_expose_client_call(struct rxrpc_call *); void rxrpc_disconnect_client_call(struct rxrpc_bundle *, struct rxrpc_call *); void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); void rxrpc_discard_expired_client_conns(struct rxrpc_local *local); void rxrpc_clean_up_local_conns(struct rxrpc_local *); /* * conn_event.c */ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, struct sk_buff *skb, unsigned int channel); int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, s32 abort_code, int err, enum rxrpc_abort_reason why); void rxrpc_process_connection(struct work_struct *); void rxrpc_process_delayed_final_acks(struct rxrpc_connection *, bool); bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb); void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb); static inline bool rxrpc_is_conn_aborted(const struct rxrpc_connection *conn) { /* Order reading the abort info after the state check. */ return smp_load_acquire(&conn->state) == RXRPC_CONN_ABORTED; } /* * conn_object.c */ extern unsigned int rxrpc_connection_expiry; extern unsigned int rxrpc_closed_conn_expiry; void rxrpc_poke_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why); struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *, gfp_t); struct rxrpc_connection *rxrpc_find_client_connection_rcu(struct rxrpc_local *, struct sockaddr_rxrpc *, struct sk_buff *); void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_kill_client_conn(struct rxrpc_connection *); void rxrpc_queue_conn(struct rxrpc_connection *, enum rxrpc_conn_trace); void rxrpc_see_connection(struct rxrpc_connection *, enum rxrpc_conn_trace); struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *, enum rxrpc_conn_trace); struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *, enum rxrpc_conn_trace); void rxrpc_put_connection(struct rxrpc_connection *, enum rxrpc_conn_trace); void rxrpc_service_connection_reaper(struct work_struct *); void rxrpc_destroy_all_connections(struct rxrpc_net *); static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn) { return conn->out_clientflag; } static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn) { return !rxrpc_conn_is_client(conn); } static inline void rxrpc_reduce_conn_timer(struct rxrpc_connection *conn, unsigned long expire_at) { timer_reduce(&conn->timer, expire_at); } /* * conn_service.c */ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, struct sk_buff *); struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *, gfp_t); void rxrpc_new_incoming_connection(struct rxrpc_sock *, struct rxrpc_connection *, const struct rxrpc_security *, struct sk_buff *); void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* * input.c */ void rxrpc_congestion_degrade(struct rxrpc_call *); void rxrpc_input_call_packet(struct rxrpc_call *, struct sk_buff *); void rxrpc_implicit_end_call(struct rxrpc_call *, struct sk_buff *); /* * io_thread.c */ int rxrpc_encap_rcv(struct sock *, struct sk_buff *); void rxrpc_error_report(struct sock *); bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, s32 abort_code, int err); int rxrpc_io_thread(void *data); static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local) { wake_up_process(READ_ONCE(local->io_thread)); } static inline bool rxrpc_protocol_error(struct sk_buff *skb, enum rxrpc_abort_reason why) { return rxrpc_direct_abort(skb, why, RX_PROTOCOL_ERROR, -EPROTO); } /* * insecure.c */ extern const struct rxrpc_security rxrpc_no_security; /* * key.c */ extern struct key_type key_type_rxrpc; int rxrpc_request_key(struct rxrpc_sock *, sockptr_t , int); int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time64_t, u32); /* * local_event.c */ void rxrpc_gen_version_string(void); void rxrpc_send_version_request(struct rxrpc_local *local, struct rxrpc_host_header *hdr, struct sk_buff *skb); /* * local_object.c */ void rxrpc_local_dont_fragment(const struct rxrpc_local *local, bool set); struct rxrpc_local *rxrpc_lookup_local(struct net *, const struct sockaddr_rxrpc *); struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *, enum rxrpc_local_trace); struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *, enum rxrpc_local_trace); void rxrpc_put_local(struct rxrpc_local *, enum rxrpc_local_trace); struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *, enum rxrpc_local_trace); void rxrpc_unuse_local(struct rxrpc_local *, enum rxrpc_local_trace); void rxrpc_destroy_local(struct rxrpc_local *local); void rxrpc_destroy_all_locals(struct rxrpc_net *); static inline bool __rxrpc_use_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { int r, u; r = refcount_read(&local->ref); u = atomic_fetch_add_unless(&local->active_users, 1, 0); trace_rxrpc_local(local->debug_id, why, r, u); return u != 0; } static inline void rxrpc_see_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { int r, u; r = refcount_read(&local->ref); u = atomic_read(&local->active_users); trace_rxrpc_local(local->debug_id, why, r, u); } /* * misc.c */ extern unsigned int rxrpc_max_backlog __read_mostly; extern unsigned long rxrpc_soft_ack_delay; extern unsigned long rxrpc_idle_ack_delay; extern unsigned int rxrpc_rx_window_size; extern unsigned int rxrpc_rx_mtu; extern unsigned int rxrpc_rx_jumbo_max; #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY extern unsigned long rxrpc_inject_rx_delay; #endif /* * net_ns.c */ extern unsigned int rxrpc_net_id; extern struct pernet_operations rxrpc_net_ops; static inline struct rxrpc_net *rxrpc_net(struct net *net) { return net_generic(net, rxrpc_net_id); } /* * output.c */ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why); int rxrpc_send_abort_packet(struct rxrpc_call *); void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); void rxrpc_send_keepalive(struct rxrpc_peer *); void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb); /* * peer_event.c */ void rxrpc_input_error(struct rxrpc_local *, struct sk_buff *); void rxrpc_peer_keepalive_worker(struct work_struct *); /* * peer_object.c */ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *, const struct sockaddr_rxrpc *); struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, gfp_t gfp); struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t, enum rxrpc_peer_trace); void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer); void rxrpc_destroy_all_peers(struct rxrpc_net *); struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *, enum rxrpc_peer_trace); struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *, enum rxrpc_peer_trace); void rxrpc_put_peer(struct rxrpc_peer *, enum rxrpc_peer_trace); /* * proc.c */ extern const struct seq_operations rxrpc_call_seq_ops; extern const struct seq_operations rxrpc_connection_seq_ops; extern const struct seq_operations rxrpc_bundle_seq_ops; extern const struct seq_operations rxrpc_peer_seq_ops; extern const struct seq_operations rxrpc_local_seq_ops; /* * recvmsg.c */ void rxrpc_notify_socket(struct rxrpc_call *); int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); /* * Abort a call due to a protocol error. */ static inline int rxrpc_abort_eproto(struct rxrpc_call *call, struct sk_buff *skb, s32 abort_code, enum rxrpc_abort_reason why) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); rxrpc_abort_call(call, sp->hdr.seq, abort_code, -EPROTO, why); return -EPROTO; } /* * rtt.c */ void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, int, rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans); void rxrpc_peer_init_rtt(struct rxrpc_peer *); /* * rxkad.c */ #ifdef CONFIG_RXKAD extern const struct rxrpc_security rxkad; #endif /* * security.c */ int __init rxrpc_init_security(void); const struct rxrpc_security *rxrpc_security_lookup(u8); void rxrpc_exit_security(void); int rxrpc_init_client_call_security(struct rxrpc_call *); int rxrpc_init_client_conn_security(struct rxrpc_connection *); const struct rxrpc_security *rxrpc_get_incoming_security(struct rxrpc_sock *, struct sk_buff *); struct key *rxrpc_look_up_server_security(struct rxrpc_connection *, struct sk_buff *, u32, u32); /* * sendmsg.c */ bool rxrpc_propose_abort(struct rxrpc_call *call, s32 abort_code, int error, enum rxrpc_abort_reason why); int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); /* * server_key.c */ extern struct key_type key_type_rxrpc_s; int rxrpc_server_keyring(struct rxrpc_sock *, sockptr_t, int); /* * skbuff.c */ void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_eaten_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_purge_queue(struct sk_buff_head *); /* * stats.c */ int rxrpc_stats_show(struct seq_file *seq, void *v); int rxrpc_stats_clear(struct file *file, char *buf, size_t size); #define rxrpc_inc_stat(rxnet, s) atomic_inc(&(rxnet)->s) #define rxrpc_dec_stat(rxnet, s) atomic_dec(&(rxnet)->s) /* * sysctl.c */ #ifdef CONFIG_SYSCTL extern int __init rxrpc_sysctl_init(void); extern void rxrpc_sysctl_exit(void); #else static inline int __init rxrpc_sysctl_init(void) { return 0; } static inline void rxrpc_sysctl_exit(void) {} #endif /* * txbuf.c */ extern atomic_t rxrpc_nr_txbuf; struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, size_t data_align, gfp_t gfp); struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_size); void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); /* * utils.c */ int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); static inline bool before(u32 seq1, u32 seq2) { return (s32)(seq1 - seq2) < 0; } static inline bool before_eq(u32 seq1, u32 seq2) { return (s32)(seq1 - seq2) <= 0; } static inline bool after(u32 seq1, u32 seq2) { return (s32)(seq1 - seq2) > 0; } static inline bool after_eq(u32 seq1, u32 seq2) { return (s32)(seq1 - seq2) >= 0; } /* * debug tracing */ extern unsigned int rxrpc_debug; #define dbgprintk(FMT,...) \ printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__) #define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) #if defined(__KDEBUG) #define _enter(FMT,...) kenter(FMT,##__VA_ARGS__) #define _leave(FMT,...) kleave(FMT,##__VA_ARGS__) #define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__) #elif defined(CONFIG_AF_RXRPC_DEBUG) #define RXRPC_DEBUG_KENTER 0x01 #define RXRPC_DEBUG_KLEAVE 0x02 #define RXRPC_DEBUG_KDEBUG 0x04 #define _enter(FMT,...) \ do { \ if (unlikely(rxrpc_debug & RXRPC_DEBUG_KENTER)) \ kenter(FMT,##__VA_ARGS__); \ } while (0) #define _leave(FMT,...) \ do { \ if (unlikely(rxrpc_debug & RXRPC_DEBUG_KLEAVE)) \ kleave(FMT,##__VA_ARGS__); \ } while (0) #define _debug(FMT,...) \ do { \ if (unlikely(rxrpc_debug & RXRPC_DEBUG_KDEBUG)) \ kdebug(FMT,##__VA_ARGS__); \ } while (0) #else #define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__) #endif /* * debug assertion checking */ #if 1 // defined(__KDEBUGALL) #define ASSERT(X) \ do { \ if (unlikely(!(X))) { \ pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) #define ASSERTCMP(X, OP, Y) \ do { \ __typeof__(X) _x = (X); \ __typeof__(Y) _y = (__typeof__(X))(Y); \ if (unlikely(!(_x OP _y))) { \ pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ (unsigned long)_x, (unsigned long)_x, #OP, \ (unsigned long)_y, (unsigned long)_y); \ BUG(); \ } \ } while (0) #define ASSERTIF(C, X) \ do { \ if (unlikely((C) && !(X))) { \ pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ __typeof__(X) _x = (X); \ __typeof__(Y) _y = (__typeof__(X))(Y); \ if (unlikely((C) && !(_x OP _y))) { \ pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ (unsigned long)_x, (unsigned long)_x, #OP, \ (unsigned long)_y, (unsigned long)_y); \ BUG(); \ } \ } while (0) #else #define ASSERT(X) \ do { \ } while (0) #define ASSERTCMP(X, OP, Y) \ do { \ } while (0) #define ASSERTIF(C, X) \ do { \ } while (0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ } while (0) #endif /* __KDEBUGALL */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SOCK_REUSEPORT_H #define _SOCK_REUSEPORT_H #include <linux/filter.h> #include <linux/skbuff.h> #include <linux/types.h> #include <linux/spinlock.h> #include <net/sock.h> extern spinlock_t reuseport_lock; struct sock_reuseport { struct rcu_head rcu; u16 max_socks; /* length of socks */ u16 num_socks; /* elements in socks */ u16 num_closed_socks; /* closed elements in socks */ u16 incoming_cpu; /* The last synq overflow event timestamp of this * reuse->socks[] group. */ unsigned int synq_overflow_ts; /* ID stays the same even after the size of socks[] grows. */ unsigned int reuseport_id; unsigned int bind_inany:1; unsigned int has_conns:1; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[] __counted_by(max_socks); }; extern int reuseport_alloc(struct sock *sk, bool bind_inany); extern int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany); extern void reuseport_detach_sock(struct sock *sk); void reuseport_stop_listen_sock(struct sock *sk); extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, struct sk_buff *skb, int hdr_len); struct sock *reuseport_migrate_sock(struct sock *sk, struct sock *migrating_sk, struct sk_buff *skb); extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); extern int reuseport_detach_prog(struct sock *sk); static inline bool reuseport_has_conns(struct sock *sk) { struct sock_reuseport *reuse; bool ret = false; rcu_read_lock(); reuse = rcu_dereference(sk->sk_reuseport_cb); if (reuse && reuse->has_conns) ret = true; rcu_read_unlock(); return ret; } void reuseport_has_conns_set(struct sock *sk); void reuseport_update_incoming_cpu(struct sock *sk, int val); #endif /* _SOCK_REUSEPORT_H */
353 353 124 354 354 352 434 403 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_DST_METADATA_H #define __NET_DST_METADATA_H 1 #include <linux/skbuff.h> #include <net/ip_tunnels.h> #include <net/macsec.h> #include <net/dst.h> enum metadata_type { METADATA_IP_TUNNEL, METADATA_HW_PORT_MUX, METADATA_MACSEC, METADATA_XFRM, }; struct hw_port_info { struct net_device *lower_dev; u32 port_id; }; struct macsec_info { sci_t sci; }; struct xfrm_md_info { u32 if_id; int link; struct dst_entry *dst_orig; }; struct metadata_dst { struct dst_entry dst; enum metadata_type type; union { struct ip_tunnel_info tun_info; struct hw_port_info port_info; struct macsec_info macsec_info; struct xfrm_md_info xfrm_info; } u; }; static inline struct metadata_dst *skb_metadata_dst(const struct sk_buff *skb) { struct metadata_dst *md_dst = (struct metadata_dst *) skb_dst(skb); if (md_dst && md_dst->dst.flags & DST_METADATA) return md_dst; return NULL; } static inline struct ip_tunnel_info * skb_tunnel_info(const struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dst_entry *dst; if (md_dst && md_dst->type == METADATA_IP_TUNNEL) return &md_dst->u.tun_info; dst = skb_dst(skb); if (dst && dst->lwtstate && (dst->lwtstate->type == LWTUNNEL_ENCAP_IP || dst->lwtstate->type == LWTUNNEL_ENCAP_IP6)) return lwt_tun_info(dst->lwtstate); return NULL; } static inline struct xfrm_md_info *lwt_xfrm_info(struct lwtunnel_state *lwt) { return (struct xfrm_md_info *)lwt->data; } static inline struct xfrm_md_info *skb_xfrm_md_info(const struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dst_entry *dst; if (md_dst && md_dst->type == METADATA_XFRM) return &md_dst->u.xfrm_info; dst = skb_dst(skb); if (dst && dst->lwtstate && dst->lwtstate->type == LWTUNNEL_ENCAP_XFRM) return lwt_xfrm_info(dst->lwtstate); return NULL; } static inline bool skb_valid_dst(const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); return dst && !(dst->flags & DST_METADATA); } static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, const struct sk_buff *skb_b) { const struct metadata_dst *a, *b; if (!(skb_a->_skb_refdst | skb_b->_skb_refdst)) return 0; a = (const struct metadata_dst *) skb_dst(skb_a); b = (const struct metadata_dst *) skb_dst(skb_b); if (!a != !b || a->type != b->type) return 1; switch (a->type) { case METADATA_HW_PORT_MUX: return memcmp(&a->u.port_info, &b->u.port_info, sizeof(a->u.port_info)); case METADATA_IP_TUNNEL: return memcmp(&a->u.tun_info, &b->u.tun_info, sizeof(a->u.tun_info) + a->u.tun_info.options_len); case METADATA_MACSEC: return memcmp(&a->u.macsec_info, &b->u.macsec_info, sizeof(a->u.macsec_info)); case METADATA_XFRM: return memcmp(&a->u.xfrm_info, &b->u.xfrm_info, sizeof(a->u.xfrm_info)); default: return 1; } } void metadata_dst_free(struct metadata_dst *); struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, gfp_t flags); void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst); struct metadata_dst __percpu * metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags); static inline struct metadata_dst *tun_rx_dst(int md_size) { struct metadata_dst *tun_dst; tun_dst = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC); if (!tun_dst) return NULL; tun_dst->u.tun_info.options_len = 0; tun_dst->u.tun_info.mode = 0; return tun_dst; } static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); int md_size; struct metadata_dst *new_md; if (!md_dst || md_dst->type != METADATA_IP_TUNNEL) return ERR_PTR(-EINVAL); md_size = md_dst->u.tun_info.options_len; new_md = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC); if (!new_md) return ERR_PTR(-ENOMEM); unsafe_memcpy(&new_md->u.tun_info, &md_dst->u.tun_info, sizeof(struct ip_tunnel_info) + md_size, /* metadata_dst_alloc() reserves room (md_size bytes) for * options right after the ip_tunnel_info struct. */); #ifdef CONFIG_DST_CACHE /* Unclone the dst cache if there is one */ if (new_md->u.tun_info.dst_cache.cache) { int ret; ret = dst_cache_init(&new_md->u.tun_info.dst_cache, GFP_ATOMIC); if (ret) { metadata_dst_free(new_md); return ERR_PTR(ret); } } #endif skb_dst_drop(skb); skb_dst_set(skb, &new_md->dst); return new_md; } static inline struct ip_tunnel_info *skb_tunnel_info_unclone(struct sk_buff *skb) { struct metadata_dst *dst; dst = tun_dst_unclone(skb); if (IS_ERR(dst)) return NULL; return &dst->u.tun_info; } static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr, __be32 daddr, __u8 tos, __u8 ttl, __be16 tp_dst, const unsigned long *flags, __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; tun_dst = tun_rx_dst(md_size); if (!tun_dst) return NULL; ip_tunnel_key_init(&tun_dst->u.tun_info.key, saddr, daddr, tos, ttl, 0, 0, tp_dst, tunnel_id, flags); return tun_dst; } static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, const unsigned long *flags, __be64 tunnel_id, int md_size) { const struct iphdr *iph = ip_hdr(skb); return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, 0, flags, tunnel_id, md_size); } static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr, const struct in6_addr *daddr, __u8 tos, __u8 ttl, __be16 tp_dst, __be32 label, const unsigned long *flags, __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; struct ip_tunnel_info *info; tun_dst = tun_rx_dst(md_size); if (!tun_dst) return NULL; info = &tun_dst->u.tun_info; info->mode = IP_TUNNEL_INFO_IPV6; ip_tunnel_flags_copy(info->key.tun_flags, flags); info->key.tun_id = tunnel_id; info->key.tp_src = 0; info->key.tp_dst = tp_dst; info->key.u.ipv6.src = *saddr; info->key.u.ipv6.dst = *daddr; info->key.tos = tos; info->key.ttl = ttl; info->key.label = label; return tun_dst; } static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, const unsigned long *flags, __be64 tunnel_id, int md_size) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); return __ipv6_tun_set_dst(&ip6h->saddr, &ip6h->daddr, ipv6_get_dsfield(ip6h), ip6h->hop_limit, 0, ip6_flowlabel(ip6h), flags, tunnel_id, md_size); } #endif /* __NET_DST_METADATA_H */
891 27 30 801 194 50 871 872 799 255 875 158 586 587 458 462 297 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 // SPDX-License-Identifier: GPL-2.0-or-later /* * * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IP/TCP/UDP checksumming routines * * Authors: Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Tom May, <ftom@netcom.com> * Andreas Schwab, <schwab@issan.informatik.uni-dortmund.de> * Lots of code moved from tcp.c and ip.c; see those files * for more names. * * 03/02/96 Jes Sorensen, Andreas Schwab, Roman Hodek: * Fixed some nasty bugs, causing some horrible crashes. * A: At some points, the sum (%0) was used as * length-counter instead of the length counter * (%1). Thanks to Roman Hodek for pointing this out. * B: GCC seems to mess up if one uses too many * data-registers to hold input values and one tries to * specify d0 and d1 as scratch registers. Letting gcc * choose these registers itself solves the problem. */ /* Revised by Kenneth Albanowski for m68knommu. Basic problem: unaligned access kills, so most of the assembly has to go. */ #include <linux/export.h> #include <net/checksum.h> #include <asm/byteorder.h> #ifndef do_csum static unsigned int do_csum(const unsigned char *buff, int len) { int odd; unsigned int result = 0; if (len <= 0) goto out; odd = 1 & (unsigned long) buff; if (odd) { #ifdef __LITTLE_ENDIAN result += (*buff << 8); #else result = *buff; #endif len--; buff++; } if (len >= 2) { if (2 & (unsigned long) buff) { result += *(unsigned short *) buff; len -= 2; buff += 2; } if (len >= 4) { const unsigned char *end = buff + ((unsigned)len & ~3); unsigned int carry = 0; do { unsigned int w = *(unsigned int *) buff; buff += 4; result += carry; result += w; carry = (w > result); } while (buff < end); result += carry; result = (result & 0xffff) + (result >> 16); } if (len & 2) { result += *(unsigned short *) buff; buff += 2; } } if (len & 1) #ifdef __LITTLE_ENDIAN result += *buff; #else result += (*buff << 8); #endif result = csum_from32to16(result); if (odd) result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); out: return result; } #endif #ifndef ip_fast_csum /* * This is a version of ip_compute_csum() optimized for IP headers, * which always checksum on 4 octet boundaries. */ __sum16 ip_fast_csum(const void *iph, unsigned int ihl) { return (__force __sum16)~do_csum(iph, ihl*4); } EXPORT_SYMBOL(ip_fast_csum); #endif /* * computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) * * returns a 32-bit number suitable for feeding into itself * or csum_tcpudp_magic * * this function must be called with even lengths, except * for the last fragment, which may be odd * * it's best to have buff aligned on a 32-bit boundary */ __wsum csum_partial(const void *buff, int len, __wsum wsum) { unsigned int sum = (__force unsigned int)wsum; unsigned int result = do_csum(buff, len); /* add in old sum, and carry.. */ result += sum; if (sum > result) result += 1; return (__force __wsum)result; } EXPORT_SYMBOL(csum_partial); /* * this routine is used for miscellaneous IP-like checksums, mainly * in icmp.c */ __sum16 ip_compute_csum(const void *buff, int len) { return (__force __sum16)~do_csum(buff, len); } EXPORT_SYMBOL(ip_compute_csum); #ifndef csum_tcpudp_nofold static inline u32 from64to32(u64 x) { /* add up 32-bit and 32-bit for 32+c bit */ x = (x & 0xffffffff) + (x >> 32); /* add up carry.. */ x = (x & 0xffffffff) + (x >> 32); return (u32)x; } __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum) { unsigned long long s = (__force u32)sum; s += (__force u32)saddr; s += (__force u32)daddr; #ifdef __BIG_ENDIAN s += proto + len; #else s += (proto + len) << 8; #endif return (__force __wsum)from64to32(s); } EXPORT_SYMBOL(csum_tcpudp_nofold); #endif
481 331 1 16 105 34 417 3 315 284 33 314 314 7 315 315 314 19 136 98 81 5 231 81 1 1 160 160 159 160 159 2 158 160 158 2 160 130 38 169 169 169 169 169 9 160 160 2 2 1 1 1 8 1 7 7 4 2 100 98 3 3 2 2 1 1009 943 83 152 65 207 12 207 30 331 46 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 // SPDX-License-Identifier: GPL-2.0-or-later /* linux/net/ipv4/arp.c * * Copyright (C) 1994 by Florian La Roche * * This module implements the Address Resolution Protocol ARP (RFC 826), * which is used to convert IP addresses (or in the future maybe other * high-level addresses) into a low-level hardware address (like an Ethernet * address). * * Fixes: * Alan Cox : Removed the Ethernet assumptions in * Florian's code * Alan Cox : Fixed some small errors in the ARP * logic * Alan Cox : Allow >4K in /proc * Alan Cox : Make ARP add its own protocol entry * Ross Martin : Rewrote arp_rcv() and arp_get_info() * Stephen Henson : Add AX25 support to arp_get_info() * Alan Cox : Drop data when a device is downed. * Alan Cox : Use init_timer(). * Alan Cox : Double lock fixes. * Martin Seine : Move the arphdr structure * to if_arp.h for compatibility. * with BSD based programs. * Andrew Tridgell : Added ARP netmask code and * re-arranged proxy handling. * Alan Cox : Changed to use notifiers. * Niibe Yutaka : Reply for this device or proxies only. * Alan Cox : Don't proxy across hardware types! * Jonathan Naylor : Added support for NET/ROM. * Mike Shaver : RFC1122 checks. * Jonathan Naylor : Only lookup the hardware address for * the correct hardware type. * Germano Caronni : Assorted subtle races. * Craig Schlenter : Don't modify permanent entry * during arp_rcv. * Russ Nelson : Tidied up a few bits. * Alexey Kuznetsov: Major changes to caching and behaviour, * eg intelligent arp probing and * generation * of host down events. * Alan Cox : Missing unlock in device events. * Eckes : ARP ioctl control errors. * Alexey Kuznetsov: Arp free fix. * Manuel Rodriguez: Gratuitous ARP. * Jonathan Layes : Added arpd support through kerneld * message queue (960314) * Mike Shaver : /proc/sys/net/ipv4/arp_* support * Mike McLagan : Routing by source * Stuart Cheshire : Metricom and grat arp fixes * *** FOR 2.1 clean this up *** * Lawrence V. Stefani: (08/12/96) Added FDDI support. * Alan Cox : Took the AP1000 nasty FDDI hack and * folded into the mainstream FDDI code. * Ack spit, Linus how did you allow that * one in... * Jes Sorensen : Make FDDI work again in 2.1.x and * clean up the APFDDI & gen. FDDI bits. * Alexey Kuznetsov: new arp state machine; * now it is in net/core/neighbour.c. * Krzysztof Halasa: Added Frame Relay ARP support. * Arnaldo C. Melo : convert /proc/net/arp to seq_file * Shmulik Hen: Split arp_send to arp_create and * arp_xmit so intermediate drivers like * bonding can change the skb before * sending (e.g. insert 8021q tag). * Harald Welte : convert to make use of jenkins hash * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/mm.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/fddidevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/net.h> #include <linux/rcupdate.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/net_namespace.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/sock.h> #include <net/arp.h> #include <net/ax25.h> #include <net/netrom.h> #include <net/dst_metadata.h> #include <net/ip_tunnels.h> #include <linux/uaccess.h> #include <linux/netfilter_arp.h> /* * Interface to generic neighbour cache. */ static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); static bool arp_key_eq(const struct neighbour *n, const void *pkey); static int arp_constructor(struct neighbour *neigh); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); static void parp_redo(struct sk_buff *skb); static int arp_is_multicast(const void *pkey); static const struct neigh_ops arp_generic_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, }; static const struct neigh_ops arp_hh_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, }; static const struct neigh_ops arp_direct_ops = { .family = AF_INET, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; struct neigh_table arp_tbl = { .family = AF_INET, .key_len = 4, .protocol = cpu_to_be16(ETH_P_IP), .hash = arp_hash, .key_eq = arp_key_eq, .constructor = arp_constructor, .proxy_redo = parp_redo, .is_multicast = arp_is_multicast, .id = "arp_cache", .parms = { .tbl = &arp_tbl, .reachable_time = 30 * HZ, .data = { [NEIGH_VAR_MCAST_PROBES] = 3, [NEIGH_VAR_UCAST_PROBES] = 3, [NEIGH_VAR_RETRANS_TIME] = 1 * HZ, [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, [NEIGH_VAR_LOCKTIME] = 1 * HZ, }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, .gc_thresh2 = 512, .gc_thresh3 = 1024, }; EXPORT_SYMBOL(arp_tbl); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) { switch (dev->type) { case ARPHRD_ETHER: case ARPHRD_FDDI: case ARPHRD_IEEE802: ip_eth_mc_map(addr, haddr); return 0; case ARPHRD_INFINIBAND: ip_ib_mc_map(addr, dev->broadcast, haddr); return 0; case ARPHRD_IPGRE: ip_ipgre_mc_map(addr, dev->broadcast, haddr); return 0; default: if (dir) { memcpy(haddr, dev->broadcast, dev->addr_len); return 0; } } return -EINVAL; } static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { return arp_hashfn(pkey, dev, hash_rnd); } static bool arp_key_eq(const struct neighbour *neigh, const void *pkey) { return neigh_key_eq32(neigh, pkey); } static int arp_constructor(struct neighbour *neigh) { __be32 addr; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; u32 inaddr_any = INADDR_ANY; if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len); addr = *(__be32 *)neigh->primary_key; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return -EINVAL; } neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr); parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &arp_direct_ops; neigh->output = neigh_direct_output; } else { /* Good devices (checked by reading texts, but only Ethernet is tested) ARPHRD_ETHER: (ethernet, apfddi) ARPHRD_FDDI: (fddi) ARPHRD_IEEE802: (tr) ARPHRD_METRICOM: (strip) ARPHRD_ARCNET: etc. etc. etc. ARPHRD_IPDDP will also work, if author repairs it. I did not it, because this driver does not work even in old paradigm. */ if (neigh->type == RTN_MULTICAST) { neigh->nud_state = NUD_NOARP; arp_mc_map(addr, neigh->ha, dev, 1); } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); } else if (neigh->type == RTN_BROADCAST || (dev->flags & IFF_POINTOPOINT)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } if (dev->header_ops->cache) neigh->ops = &arp_hh_ops; else neigh->ops = &arp_generic_ops; if (neigh->nud_state & NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; } return 0; } static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) { dst_link_failure(skb); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); } /* Create and send an arp packet. */ static void arp_send_dst(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw, struct dst_entry *dst) { struct sk_buff *skb; /* arp on this interface. */ if (dev->flags & IFF_NOARP) return; skb = arp_create(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw); if (!skb) return; skb_dst_set(skb, dst_clone(dst)); arp_xmit(skb); } void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw) { arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw, NULL); } EXPORT_SYMBOL(arp_send); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 saddr = 0; u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL; struct net_device *dev = neigh->dev; __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; struct dst_entry *dst = NULL; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return; } switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ if (skb && inet_addr_type_dev_table(dev_net(dev), dev, ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ if (!skb) break; saddr = ip_hdr(skb)->saddr; if (inet_addr_type_dev_table(dev_net(dev), dev, saddr) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) break; } saddr = 0; break; case 2: /* Avoid secondary IPs, get a primary/preferred one */ break; } rcu_read_unlock(); if (!saddr) saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) pr_debug("trying to ucast probe in NUD_INVALID\n"); neigh_ha_snapshot(dst_ha, neigh, dev); dst_hw = dst_ha; } else { probes -= NEIGH_VAR(neigh->parms, APP_PROBES); if (probes < 0) { neigh_app_ns(neigh); return; } } if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE)) dst = skb_dst(skb); arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, dst_hw, dev->dev_addr, NULL, dst); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) { struct net *net = dev_net(in_dev->dev); int scope; switch (IN_DEV_ARP_IGNORE(in_dev)) { case 0: /* Reply, the tip is already validated */ return 0; case 1: /* Reply only if tip is configured on the incoming interface */ sip = 0; scope = RT_SCOPE_HOST; break; case 2: /* * Reply only if tip is configured on the incoming interface * and is in same subnet as sip */ scope = RT_SCOPE_HOST; break; case 3: /* Do not reply for scope host addresses */ sip = 0; scope = RT_SCOPE_LINK; in_dev = NULL; break; case 4: /* Reserved */ case 5: case 6: case 7: return 0; case 8: /* Do not reply */ return 1; default: return 0; } return !inet_confirm_addr(net, in_dev, sip, tip, scope); } static int arp_accept(struct in_device *in_dev, __be32 sip) { struct net *net = dev_net(in_dev->dev); int scope = RT_SCOPE_LINK; switch (IN_DEV_ARP_ACCEPT(in_dev)) { case 0: /* Don't create new entries from garp */ return 0; case 1: /* Create new entries from garp */ return 1; case 2: /* Create a neighbor in the arp table only if sip * is in the same subnet as an address configured * on the interface that received the garp message */ return !!inet_confirm_addr(net, in_dev, sip, 0, scope); default: return 0; } } static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) { struct rtable *rt; int flag = 0; /*unsigned long now; */ struct net *net = dev_net(dev); rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev), RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return 1; if (rt->dst.dev != dev) { __NET_INC_STATS(net, LINUX_MIB_ARPFILTER); flag = 1; } ip_rt_put(rt); return flag; } /* * Check if we can use proxy ARP for this path */ static inline int arp_fwd_proxy(struct in_device *in_dev, struct net_device *dev, struct rtable *rt) { struct in_device *out_dev; int imi, omi = -1; if (rt->dst.dev == dev) return 0; if (!IN_DEV_PROXY_ARP(in_dev)) return 0; imi = IN_DEV_MEDIUM_ID(in_dev); if (imi == 0) return 1; if (imi == -1) return 0; /* place to check for proxy_arp for routes */ out_dev = __in_dev_get_rcu(rt->dst.dev); if (out_dev) omi = IN_DEV_MEDIUM_ID(out_dev); return omi != imi && omi != -1; } /* * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev) * * RFC3069 supports proxy arp replies back to the same interface. This * is done to support (ethernet) switch features, like RFC 3069, where * the individual ports are not allowed to communicate with each * other, BUT they are allowed to talk to the upstream router. As * described in RFC 3069, it is possible to allow these hosts to * communicate through the upstream router, by proxy_arp'ing. * * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation" * * This technology is known by different names: * In RFC 3069 it is called VLAN Aggregation. * Cisco and Allied Telesyn call it Private VLAN. * Hewlett-Packard call it Source-Port filtering or port-isolation. * Ericsson call it MAC-Forced Forwarding (RFC Draft). * */ static inline int arp_fwd_pvlan(struct in_device *in_dev, struct net_device *dev, struct rtable *rt, __be32 sip, __be32 tip) { /* Private VLAN is only concerned about the same ethernet segment */ if (rt->dst.dev != dev) return 0; /* Don't reply on self probes (often done by windowz boxes)*/ if (sip == tip) return 0; if (IN_DEV_PROXY_ARP_PVLAN(in_dev)) return 1; else return 0; } /* * Interface to link layer: send routine and receive handler. */ /* * Create an arp packet. If dest_hw is not set, we create a broadcast * message. */ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw) { struct sk_buff *skb; struct arphdr *arp; unsigned char *arp_ptr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; /* * Allocate a buffer */ skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC); if (!skb) return NULL; skb_reserve(skb, hlen); skb_reset_network_header(skb); arp = skb_put(skb, arp_hdr_len(dev)); skb->dev = dev; skb->protocol = htons(ETH_P_ARP); if (!src_hw) src_hw = dev->dev_addr; if (!dest_hw) dest_hw = dev->broadcast; /* * Fill the device header for the ARP frame */ if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0) goto out; /* * Fill out the arp protocol part. * * The arp hardware type should match the device type, except for FDDI, * which (according to RFC 1390) should always equal 1 (Ethernet). */ /* * Exceptions everywhere. AX.25 uses the AX.25 PID value not the * DIX code for the protocol. Make these device structure fields. */ switch (dev->type) { default: arp->ar_hrd = htons(dev->type); arp->ar_pro = htons(ETH_P_IP); break; #if IS_ENABLED(CONFIG_AX25) case ARPHRD_AX25: arp->ar_hrd = htons(ARPHRD_AX25); arp->ar_pro = htons(AX25_P_IP); break; #if IS_ENABLED(CONFIG_NETROM) case ARPHRD_NETROM: arp->ar_hrd = htons(ARPHRD_NETROM); arp->ar_pro = htons(AX25_P_IP); break; #endif #endif #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: arp->ar_hrd = htons(ARPHRD_ETHER); arp->ar_pro = htons(ETH_P_IP); break; #endif } arp->ar_hln = dev->addr_len; arp->ar_pln = 4; arp->ar_op = htons(type); arp_ptr = (unsigned char *)(arp + 1); memcpy(arp_ptr, src_hw, dev->addr_len); arp_ptr += dev->addr_len; memcpy(arp_ptr, &src_ip, 4); arp_ptr += 4; switch (dev->type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: break; #endif default: if (target_hw) memcpy(arp_ptr, target_hw, dev->addr_len); else memset(arp_ptr, 0, dev->addr_len); arp_ptr += dev->addr_len; } memcpy(arp_ptr, &dest_ip, 4); return skb; out: kfree_skb(skb); return NULL; } EXPORT_SYMBOL(arp_create); static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return dev_queue_xmit(skb); } /* * Send an arp packet. */ void arp_xmit(struct sk_buff *skb) { /* Send it off, maybe filter it using firewalling first. */ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, dev_net(skb->dev), NULL, skb, NULL, skb->dev, arp_xmit_finish); } EXPORT_SYMBOL(arp_xmit); static bool arp_is_garp(struct net *net, struct net_device *dev, int *addr_type, __be16 ar_op, __be32 sip, __be32 tip, unsigned char *sha, unsigned char *tha) { bool is_garp = tip == sip; /* Gratuitous ARP _replies_ also require target hwaddr to be * the same as source. */ if (is_garp && ar_op == htons(ARPOP_REPLY)) is_garp = /* IPv4 over IEEE 1394 doesn't provide target * hardware address field in its ARP payload. */ tha && !memcmp(tha, sha, dev->addr_len); if (is_garp) { *addr_type = inet_addr_type_dev_table(net, dev, sip); if (*addr_type != RTN_UNICAST) is_garp = false; } return is_garp; } /* * Process an arp request. */ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev = __in_dev_get_rcu(dev); struct arphdr *arp; unsigned char *arp_ptr; struct rtable *rt; unsigned char *sha; unsigned char *tha = NULL; __be32 sip, tip; u16 dev_type = dev->type; int addr_type; struct neighbour *n; struct dst_entry *reply_dst = NULL; bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device * is ARP'able. */ if (!in_dev) goto out_free_skb; arp = arp_hdr(skb); switch (dev_type) { default: if (arp->ar_pro != htons(ETH_P_IP) || htons(dev_type) != arp->ar_hrd) goto out_free_skb; break; case ARPHRD_ETHER: case ARPHRD_FDDI: case ARPHRD_IEEE802: /* * ETHERNET, and Fibre Channel (which are IEEE 802 * devices, according to RFC 2625) devices will accept ARP * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2). * This is the case also of FDDI, where the RFC 1390 says that * FDDI devices should accept ARP hardware of (1) Ethernet, * however, to be more robust, we'll accept both 1 (Ethernet) * or 6 (IEEE 802.2) */ if ((arp->ar_hrd != htons(ARPHRD_ETHER) && arp->ar_hrd != htons(ARPHRD_IEEE802)) || arp->ar_pro != htons(ETH_P_IP)) goto out_free_skb; break; case ARPHRD_AX25: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_AX25)) goto out_free_skb; break; case ARPHRD_NETROM: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_NETROM)) goto out_free_skb; break; } /* Understand only these message types */ if (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST)) goto out_free_skb; /* * Extract fields */ arp_ptr = (unsigned char *)(arp + 1); sha = arp_ptr; arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4; switch (dev_type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: break; #endif default: tha = arp_ptr; arp_ptr += dev->addr_len; } memcpy(&tip, arp_ptr, 4); /* * Check for bad requests for 127.x.x.x and requests for multicast * addresses. If this is one such, delete it. */ if (ipv4_is_multicast(tip) || (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) goto out_free_skb; /* * For some 802.11 wireless deployments (and possibly other networks), * there will be an ARP proxy and gratuitous ARP frames are attacks * and thus should not be accepted. */ if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP)) goto out_free_skb; /* * Special case: We must set Frame Relay source Q.922 address */ if (dev_type == ARPHRD_DLCI) sha = dev->broadcast; /* * Process entry. The idea here is we want to send a reply if it is a * request for us or if it is a request for someone else that we hold * a proxy for. We want to add an entry to our cache if it is a reply * to us or if it is a request for our address. * (The assumption for this last is that if someone is requesting our * address, they are probably intending to talk to us, so it saves time * if we cache their address. Their address is also probably not in * our cache, since ours is not in their cache.) * * Putting this another way, we only care about replies if they are to * us, in which case we add them to the cache. For requests, we care * about those for us and those for our proxies. We reply to both, * and in the case of requests for us we add the requester to the arp * cache. */ if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb)) reply_dst = (struct dst_entry *) iptunnel_metadata_reply(skb_metadata_dst(skb), GFP_ATOMIC); /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); goto out_consume_skb; } if (arp->ar_op == htons(ARPOP_REQUEST) && ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { rt = skb_rtable(skb); addr_type = rt->rt_type; if (addr_type == RTN_LOCAL) { int dont_send; dont_send = arp_ignore(in_dev, sip, tip); if (!dont_send && IN_DEV_ARPFILTER(in_dev)) dont_send = arp_filter(sip, tip, dev); if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); neigh_release(n); } } goto out_consume_skb; } else if (IN_DEV_FORWARD(in_dev)) { if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || (rt->dst.dev != dev && pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); } else { pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); goto out_free_dst; } goto out_consume_skb; } } } /* Update our ARP tables */ n = __neigh_lookup(&arp_tbl, &sip, dev, 0); addr_type = -1; if (n || arp_accept(in_dev, sip)) { is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, sip, tip, sha, tha); } if (arp_accept(in_dev, sip)) { /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ if (!n && (is_garp || (arp->ar_op == htons(ARPOP_REPLY) && (addr_type == RTN_UNICAST || (addr_type < 0 && /* postpone calculation to as late as possible */ inet_addr_type_dev_table(net, dev, sip) == RTN_UNICAST))))) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } if (n) { int state = NUD_REACHABLE; int override; /* If several different ARP replies follows back-to-back, use the FIRST one. It is possible, if several proxy agents are active. Taking the first reply prevents arp trashing and chooses the fastest router. */ override = time_after(jiffies, n->updated + NEIGH_VAR(n->parms, LOCKTIME)) || is_garp; /* Broadcast replies and request packets do not assert neighbour reachability. */ if (arp->ar_op != htons(ARPOP_REPLY) || skb->pkt_type != PACKET_HOST) state = NUD_STALE; neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0); neigh_release(n); } out_consume_skb: consume_skb(skb); out_free_dst: dst_release(reply_dst); return NET_RX_SUCCESS; out_free_skb: kfree_skb(skb); return NET_RX_DROP; } static void parp_redo(struct sk_buff *skb) { arp_process(dev_net(skb->dev), NULL, skb); } static int arp_is_multicast(const void *pkey) { return ipv4_is_multicast(*((__be32 *)pkey)); } /* * Receive an arp request from the device layer. */ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { const struct arphdr *arp; /* do not tweak dropwatch on an ARP we will ignore */ if (dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || skb->pkt_type == PACKET_LOOPBACK) goto consumeskb; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out_of_mem; /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ if (!pskb_may_pull(skb, arp_hdr_len(dev))) goto freeskb; arp = arp_hdr(skb); if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4) goto freeskb; memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, dev_net(dev), NULL, skb, dev, NULL, arp_process); consumeskb: consume_skb(skb); return NET_RX_SUCCESS; freeskb: kfree_skb(skb); out_of_mem: return NET_RX_DROP; } /* * User level interface (ioctl) */ static struct net_device *arp_req_dev_by_name(struct net *net, struct arpreq *r, bool getarp) { struct net_device *dev; if (getarp) dev = dev_get_by_name_rcu(net, r->arp_dev); else dev = __dev_get_by_name(net, r->arp_dev); if (!dev) return ERR_PTR(-ENODEV); /* Mmmm... It is wrong... ARPHRD_NETROM == 0 */ if (!r->arp_ha.sa_family) r->arp_ha.sa_family = dev->type; if ((r->arp_flags & ATF_COM) && r->arp_ha.sa_family != dev->type) return ERR_PTR(-EINVAL); return dev; } static struct net_device *arp_req_dev(struct net *net, struct arpreq *r) { struct net_device *dev; struct rtable *rt; __be32 ip; if (r->arp_dev[0]) return arp_req_dev_by_name(net, r, false); if (r->arp_flags & ATF_PUBL) return NULL; ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; rt = ip_route_output(net, ip, 0, 0, 0, RT_SCOPE_LINK); if (IS_ERR(rt)) return ERR_CAST(rt); dev = rt->dst.dev; ip_rt_put(rt); if (!dev) return ERR_PTR(-EINVAL); return dev; } /* * Set (create) an ARP cache entry. */ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) { if (!dev) { IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; return 0; } if (__in_dev_get_rtnl(dev)) { IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on); return 0; } return -ENXIO; } static int arp_req_set_public(struct net *net, struct arpreq *r, struct net_device *dev) { __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (!dev && (r->arp_flags & ATF_COM)) { dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; } if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1)) return -ENOBUFS; return 0; } return arp_req_set_proxy(net, dev, 1); } static int arp_req_set(struct net *net, struct arpreq *r) { struct neighbour *neigh; struct net_device *dev; __be32 ip; int err; dev = arp_req_dev(net, r); if (IS_ERR(dev)) return PTR_ERR(dev); if (r->arp_flags & ATF_PUBL) return arp_req_set_public(net, r, dev); switch (dev->type) { #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: /* * According to RFC 1390, FDDI devices should accept ARP * hardware types of 1 (Ethernet). However, to be more * robust, we'll accept hardware types of either 1 (Ethernet) * or 6 (IEEE 802.2). */ if (r->arp_ha.sa_family != ARPHRD_FDDI && r->arp_ha.sa_family != ARPHRD_ETHER && r->arp_ha.sa_family != ARPHRD_IEEE802) return -EINVAL; break; #endif default: if (r->arp_ha.sa_family != dev->type) return -EINVAL; break; } ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); err = PTR_ERR(neigh); if (!IS_ERR(neigh)) { unsigned int state = NUD_STALE; if (r->arp_flags & ATF_PERM) { r->arp_flags |= ATF_COM; state = NUD_PERMANENT; } err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? r->arp_ha.sa_data : NULL, state, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, 0); neigh_release(neigh); } return err; } static unsigned int arp_state_to_flags(struct neighbour *neigh) { if (neigh->nud_state&NUD_PERMANENT) return ATF_PERM | ATF_COM; else if (neigh->nud_state&NUD_VALID) return ATF_COM; else return 0; } /* * Get an ARP cache entry. */ static int arp_req_get(struct net *net, struct arpreq *r) { __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; struct neighbour *neigh; struct net_device *dev; if (!r->arp_dev[0]) return -ENODEV; dev = arp_req_dev_by_name(net, r, true); if (IS_ERR(dev)) return PTR_ERR(dev); neigh = neigh_lookup(&arp_tbl, &ip, dev); if (!neigh) return -ENXIO; if (READ_ONCE(neigh->nud_state) & NUD_NOARP) { neigh_release(neigh); return -ENXIO; } read_lock_bh(&neigh->lock); memcpy(r->arp_ha.sa_data, neigh->ha, min(dev->addr_len, sizeof(r->arp_ha.sa_data_min))); r->arp_flags = arp_state_to_flags(neigh); read_unlock_bh(&neigh->lock); neigh_release(neigh); r->arp_ha.sa_family = dev->type; netdev_copy_name(dev, r->arp_dev); return 0; } int arp_invalidate(struct net_device *dev, __be32 ip, bool force) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; struct neigh_table *tbl = &arp_tbl; if (neigh) { if ((READ_ONCE(neigh->nud_state) & NUD_VALID) && !force) { neigh_release(neigh); return 0; } if (READ_ONCE(neigh->nud_state) & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_ADMIN, 0); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); write_unlock_bh(&tbl->lock); } return err; } static int arp_req_delete_public(struct net *net, struct arpreq *r, struct net_device *dev) { __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return pneigh_delete(&arp_tbl, net, &ip, dev); } return arp_req_set_proxy(net, dev, 0); } static int arp_req_delete(struct net *net, struct arpreq *r) { struct net_device *dev; __be32 ip; dev = arp_req_dev(net, r); if (IS_ERR(dev)) return PTR_ERR(dev); if (r->arp_flags & ATF_PUBL) return arp_req_delete_public(net, r, dev); ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return arp_invalidate(dev, ip, true); } /* * Handle an ARP layer I/O control request. */ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct arpreq r; __be32 *netmask; int err; switch (cmd) { case SIOCDARP: case SIOCSARP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; fallthrough; case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); if (err) return -EFAULT; break; default: return -EINVAL; } if (r.arp_pa.sa_family != AF_INET) return -EPFNOSUPPORT; if (!(r.arp_flags & ATF_PUBL) && (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) return -EINVAL; netmask = &((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr; if (!(r.arp_flags & ATF_NETMASK)) *netmask = htonl(0xFFFFFFFFUL); else if (*netmask && *netmask != htonl(0xFFFFFFFFUL)) return -EINVAL; switch (cmd) { case SIOCDARP: rtnl_lock(); err = arp_req_delete(net, &r); rtnl_unlock(); break; case SIOCSARP: rtnl_lock(); err = arp_req_set(net, &r); rtnl_unlock(); break; case SIOCGARP: rcu_read_lock(); err = arp_req_get(net, &r); rcu_read_unlock(); if (!err && copy_to_user(arg, &r, sizeof(r))) err = -EFAULT; break; } return err; } static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_change_info *change_info; struct in_device *in_dev; bool evict_nocarrier; switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); rt_cache_flush(dev_net(dev)); break; case NETDEV_CHANGE: change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&arp_tbl, dev); in_dev = __in_dev_get_rtnl(dev); if (!in_dev) evict_nocarrier = true; else evict_nocarrier = IN_DEV_ARP_EVICT_NOCARRIER(in_dev); if (evict_nocarrier && !netif_carrier_ok(dev)) neigh_carrier_down(&arp_tbl, dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block arp_netdev_notifier = { .notifier_call = arp_netdev_event, }; /* Note, that it is not on notifier chain. It is necessary, that this routine was called after route cache will be flushed. */ void arp_ifdown(struct net_device *dev) { neigh_ifdown(&arp_tbl, dev); } /* * Called once on startup. */ static struct packet_type arp_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_ARP), .func = arp_rcv, }; #ifdef CONFIG_PROC_FS #if IS_ENABLED(CONFIG_AX25) /* * ax25 -> ASCII conversion */ static void ax2asc2(ax25_address *a, char *buf) { char c, *s; int n; for (n = 0, s = buf; n < 6; n++) { c = (a->ax25_call[n] >> 1) & 0x7F; if (c != ' ') *s++ = c; } *s++ = '-'; n = (a->ax25_call[6] >> 1) & 0x0F; if (n > 9) { *s++ = '1'; n -= 10; } *s++ = n + '0'; *s++ = '\0'; if (*buf == '\0' || *buf == '-') { buf[0] = '*'; buf[1] = '\0'; } } #endif /* CONFIG_AX25 */ #define HBUFFERLEN 30 static void arp_format_neigh_entry(struct seq_file *seq, struct neighbour *n) { char hbuffer[HBUFFERLEN]; int k, j; char tbuf[16]; struct net_device *dev = n->dev; int hatype = dev->type; read_lock(&n->lock); /* Convert hardware address to XX:XX:XX:XX ... form. */ #if IS_ENABLED(CONFIG_AX25) if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) ax2asc2((ax25_address *)n->ha, hbuffer); else { #endif for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) { hbuffer[k++] = hex_asc_hi(n->ha[j]); hbuffer[k++] = hex_asc_lo(n->ha[j]); hbuffer[k++] = ':'; } if (k != 0) --k; hbuffer[k] = 0; #if IS_ENABLED(CONFIG_AX25) } #endif sprintf(tbuf, "%pI4", n->primary_key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s * %s\n", tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); read_unlock(&n->lock); } static void arp_format_pneigh_entry(struct seq_file *seq, struct pneigh_entry *n) { struct net_device *dev = n->dev; int hatype = dev ? dev->type : 0; char tbuf[16]; sprintf(tbuf, "%pI4", n->key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00", dev ? dev->name : "*"); } static int arp_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, "IP address HW type Flags " "HW address Mask Device\n"); } else { struct neigh_seq_state *state = seq->private; if (state->flags & NEIGH_SEQ_IS_PNEIGH) arp_format_pneigh_entry(seq, v); else arp_format_neigh_entry(seq, v); } return 0; } static void *arp_seq_start(struct seq_file *seq, loff_t *pos) { /* Don't want to confuse "arp -a" w/ magic entries, * so we tell the generic iterator to skip NUD_NOARP. */ return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP); } static const struct seq_operations arp_seq_ops = { .start = arp_seq_start, .next = neigh_seq_next, .stop = neigh_seq_stop, .show = arp_seq_show, }; #endif /* CONFIG_PROC_FS */ static int __net_init arp_net_init(struct net *net) { if (!proc_create_net("arp", 0444, net->proc_net, &arp_seq_ops, sizeof(struct neigh_seq_state))) return -ENOMEM; return 0; } static void __net_exit arp_net_exit(struct net *net) { remove_proc_entry("arp", net->proc_net); } static struct pernet_operations arp_net_ops = { .init = arp_net_init, .exit = arp_net_exit, }; void __init arp_init(void) { neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl); dev_add_pack(&arp_packet_type); register_pernet_subsys(&arp_net_ops); #ifdef CONFIG_SYSCTL neigh_sysctl_register(NULL, &arp_tbl.parms, NULL); #endif register_netdevice_notifier(&arp_netdev_notifier); }
106 95 15 82 78 4 81 1 202 215 202 106 202 208 209 209 207 207 2 1 172 4 205 38 205 2 202 202 202 10 3 18 18 18 82 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2003, 2004 * * This file is part of the SCTP kernel implementation * * This file contains the code relating the chunk abstraction. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Jon Grimm <jgrimm@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/kernel.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* This file is mostly in anticipation of future work, but initially * populate with fragment tracking for an outbound message. */ /* Initialize datamsg from memory. */ static void sctp_datamsg_init(struct sctp_datamsg *msg) { refcount_set(&msg->refcnt, 1); msg->send_failed = 0; msg->send_error = 0; msg->can_delay = 1; msg->abandoned = 0; msg->expires_at = 0; INIT_LIST_HEAD(&msg->chunks); } /* Allocate and initialize datamsg. */ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp) { struct sctp_datamsg *msg; msg = kmalloc(sizeof(struct sctp_datamsg), gfp); if (msg) { sctp_datamsg_init(msg); SCTP_DBG_OBJCNT_INC(datamsg); } return msg; } void sctp_datamsg_free(struct sctp_datamsg *msg) { struct sctp_chunk *chunk; /* This doesn't have to be a _safe vairant because * sctp_chunk_free() only drops the refs. */ list_for_each_entry(chunk, &msg->chunks, frag_list) sctp_chunk_free(chunk); sctp_datamsg_put(msg); } /* Final destructruction of datamsg memory. */ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) { struct sctp_association *asoc = NULL; struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_ulpevent *ev; int error, sent; /* Release all references. */ list_for_each_safe(pos, temp, &msg->chunks) { list_del_init(pos); chunk = list_entry(pos, struct sctp_chunk, frag_list); if (!msg->send_failed) { sctp_chunk_put(chunk); continue; } asoc = chunk->asoc; error = msg->send_error ?: asoc->outqueue.error; sent = chunk->has_tsn ? SCTP_DATA_SENT : SCTP_DATA_UNSENT; if (sctp_ulpevent_type_enabled(asoc->subscribe, SCTP_SEND_FAILED)) { ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } if (sctp_ulpevent_type_enabled(asoc->subscribe, SCTP_SEND_FAILED_EVENT)) { ev = sctp_ulpevent_make_send_failed_event(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } sctp_chunk_put(chunk); } SCTP_DBG_OBJCNT_DEC(datamsg); kfree(msg); } /* Hold a reference. */ static void sctp_datamsg_hold(struct sctp_datamsg *msg) { refcount_inc(&msg->refcnt); } /* Release a reference. */ void sctp_datamsg_put(struct sctp_datamsg *msg) { if (refcount_dec_and_test(&msg->refcnt)) sctp_datamsg_destroy(msg); } /* Assign a chunk to this datamsg. */ static void sctp_datamsg_assign(struct sctp_datamsg *msg, struct sctp_chunk *chunk) { sctp_datamsg_hold(msg); chunk->msg = msg; } /* A data chunk can have a maximum payload of (2^16 - 20). Break * down any such message into smaller chunks. Opportunistically, fragment * the chunks down to the current MTU constraints. We may get refragmented * later if the PMTU changes, but it is _much better_ to fragment immediately * with a reasonable guess than always doing our fragmentation on the * soft-interrupt. */ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, struct iov_iter *from) { size_t len, first_len, max_data, remaining; size_t msg_len = iov_iter_count(from); struct sctp_shared_key *shkey = NULL; struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_datamsg *msg; int err; msg = sctp_datamsg_new(GFP_KERNEL); if (!msg) return ERR_PTR(-ENOMEM); /* Note: Calculate this outside of the loop, so that all fragments * have the same expiration. */ if (asoc->peer.prsctp_capable && sinfo->sinfo_timetolive && (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) || !SCTP_PR_POLICY(sinfo->sinfo_flags))) msg->expires_at = jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive); /* This is the biggest possible DATA chunk that can fit into * the packet */ max_data = asoc->frag_point; if (unlikely(!max_data)) { max_data = sctp_min_frag_point(sctp_sk(asoc->base.sk), sctp_datachk_len(&asoc->stream)); pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%zu)", __func__, asoc, max_data); } /* If the peer requested that we authenticate DATA chunks * we need to account for bundling of the AUTH chunks along with * DATA. */ if (sctp_auth_send_cid(SCTP_CID_DATA, asoc)) { struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); if (hmac_desc) max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) + hmac_desc->hmac_len); if (sinfo->sinfo_tsn && sinfo->sinfo_ssn != asoc->active_key_id) { shkey = sctp_auth_get_shkey(asoc, sinfo->sinfo_ssn); if (!shkey) { err = -EINVAL; goto errout; } } else { shkey = asoc->shkey; } } /* Set first_len and then account for possible bundles on first frag */ first_len = max_data; /* Check to see if we have a pending SACK and try to let it be bundled * with this message. Do this if we don't have any data queued already. * To check that, look at out_qlen and retransmit list. * NOTE: we will not reduce to account for SACK, if the message would * not have been fragmented. */ if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) && asoc->outqueue.out_qlen == 0 && list_empty(&asoc->outqueue.retransmit) && msg_len > max_data) first_len -= SCTP_PAD4(sizeof(struct sctp_sack_chunk)); /* Encourage Cookie-ECHO bundling. */ if (asoc->state < SCTP_STATE_COOKIE_ECHOED) first_len -= SCTP_ARBITRARY_COOKIE_ECHO_LEN; /* Account for a different sized first fragment */ if (msg_len >= first_len) { msg->can_delay = 0; if (msg_len > first_len) SCTP_INC_STATS(asoc->base.net, SCTP_MIB_FRAGUSRMSGS); } else { /* Which may be the only one... */ first_len = msg_len; } /* Create chunks for all DATA chunks. */ for (remaining = msg_len; remaining; remaining -= len) { u8 frag = SCTP_DATA_MIDDLE_FRAG; if (remaining == msg_len) { /* First frag, which may also be the last */ frag |= SCTP_DATA_FIRST_FRAG; len = first_len; } else { /* Middle frags */ len = max_data; } if (len >= remaining) { /* Last frag, which may also be the first */ len = remaining; frag |= SCTP_DATA_LAST_FRAG; /* The application requests to set the I-bit of the * last DATA chunk of a user message when providing * the user message to the SCTP implementation. */ if ((sinfo->sinfo_flags & SCTP_EOF) || (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY)) frag |= SCTP_DATA_SACK_IMM; } chunk = asoc->stream.si->make_datafrag(asoc, sinfo, len, frag, GFP_KERNEL); if (!chunk) { err = -ENOMEM; goto errout; } err = sctp_user_addto_chunk(chunk, len, from); if (err < 0) goto errout_chunk_free; chunk->shkey = shkey; /* Put the chunk->skb back into the form expected by send. */ __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr - chunk->skb->data); sctp_datamsg_assign(msg, chunk); list_add_tail(&chunk->frag_list, &msg->chunks); } return msg; errout_chunk_free: sctp_chunk_free(chunk); errout: list_for_each_safe(pos, temp, &msg->chunks) { list_del_init(pos); chunk = list_entry(pos, struct sctp_chunk, frag_list); sctp_chunk_free(chunk); } sctp_datamsg_put(msg); return ERR_PTR(err); } /* Check whether this message has expired. */ int sctp_chunk_abandoned(struct sctp_chunk *chunk) { if (!chunk->asoc->peer.prsctp_capable) return 0; if (chunk->msg->abandoned) return 1; if (!chunk->has_tsn && !(chunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG)) return 0; if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && time_after(jiffies, chunk->msg->expires_at)) { struct sctp_stream_out *streamout = SCTP_SO(&chunk->asoc->stream, chunk->sinfo.sinfo_stream); if (chunk->sent_count) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(TTL)]++; } else { chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; streamout->ext->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; } chunk->msg->abandoned = 1; return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && chunk->sent_count > chunk->sinfo.sinfo_timetolive) { struct sctp_stream_out *streamout = SCTP_SO(&chunk->asoc->stream, chunk->sinfo.sinfo_stream); chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++; chunk->msg->abandoned = 1; return 1; } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) && chunk->msg->expires_at && time_after(jiffies, chunk->msg->expires_at)) { chunk->msg->abandoned = 1; return 1; } /* PRIO policy is processed by sendmsg, not here */ return 0; } /* This chunk (and consequently entire message) has failed in its sending. */ void sctp_chunk_fail(struct sctp_chunk *chunk, int error) { chunk->msg->send_failed = 1; chunk->msg->send_error = error; }
5 5 5 5 5 2 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 // SPDX-License-Identifier: GPL-2.0 /* * Provide a default dump_stack() function for architectures * which don't implement their own. */ #include <linux/kernel.h> #include <linux/buildid.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/smp.h> #include <linux/atomic.h> #include <linux/kexec.h> #include <linux/utsname.h> #include <linux/stop_machine.h> static char dump_stack_arch_desc_str[128]; /** * dump_stack_set_arch_desc - set arch-specific str to show with task dumps * @fmt: printf-style format string * @...: arguments for the format string * * The configured string will be printed right after utsname during task * dumps. Usually used to add arch-specific system identifiers. If an * arch wants to make use of such an ID string, it should initialize this * as soon as possible during boot. */ void __init dump_stack_set_arch_desc(const char *fmt, ...) { va_list args; va_start(args, fmt); vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str), fmt, args); va_end(args); } #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) #define BUILD_ID_FMT " %20phN" #define BUILD_ID_VAL vmlinux_build_id #else #define BUILD_ID_FMT "%s" #define BUILD_ID_VAL "" #endif /** * dump_stack_print_info - print generic debug info for dump_stack() * @log_lvl: log level * * Arch-specific dump_stack() implementations can use this function to * print out the same debug information as the generic dump_stack(). */ void dump_stack_print_info(const char *log_lvl) { printk("%sCPU: %d UID: %u PID: %d Comm: %.20s %s%s %s %.*s" BUILD_ID_FMT "\n", log_lvl, raw_smp_processor_id(), __kuid_val(current_real_cred()->euid), current->pid, current->comm, kexec_crash_loaded() ? "Kdump: loaded " : "", print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version, BUILD_ID_VAL); if (get_taint()) printk("%s%s\n", log_lvl, print_tainted_verbose()); if (dump_stack_arch_desc_str[0] != '\0') printk("%sHardware name: %s\n", log_lvl, dump_stack_arch_desc_str); print_worker_info(log_lvl, current); print_stop_info(log_lvl, current); print_scx_info(log_lvl, current); } /** * show_regs_print_info - print generic debug info for show_regs() * @log_lvl: log level * * show_regs() implementations can use this function to print out generic * debug information. */ void show_regs_print_info(const char *log_lvl) { dump_stack_print_info(log_lvl); } static void __dump_stack(const char *log_lvl) { dump_stack_print_info(log_lvl); show_stack(NULL, NULL, log_lvl); } /** * dump_stack_lvl - dump the current task information and its stack trace * @log_lvl: log level * * Architectures can override this implementation by implementing its own. */ asmlinkage __visible void dump_stack_lvl(const char *log_lvl) { bool in_panic = this_cpu_in_panic(); unsigned long flags; /* * Permit this cpu to perform nested stack dumps while serialising * against other CPUs, unless this CPU is in panic. * * When in panic, non-panic CPUs are not permitted to store new * printk messages so there is no need to synchronize the output. * This avoids potential deadlock in panic() if another CPU is * holding and unable to release the printk_cpu_sync. */ if (!in_panic) printk_cpu_sync_get_irqsave(flags); __dump_stack(log_lvl); if (!in_panic) printk_cpu_sync_put_irqrestore(flags); } EXPORT_SYMBOL(dump_stack_lvl); asmlinkage __visible void dump_stack(void) { dump_stack_lvl(KERN_DEFAULT); } EXPORT_SYMBOL(dump_stack);
57 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UNWIND_H #define _ASM_X86_UNWIND_H #include <linux/sched.h> #include <linux/ftrace.h> #include <linux/rethook.h> #include <asm/ptrace.h> #include <asm/stacktrace.h> #define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip)) #define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET) struct unwind_state { struct stack_info stack_info; unsigned long stack_mask; struct task_struct *task; int graph_idx; #if defined(CONFIG_RETHOOK) struct llist_node *kr_cur; #endif bool error; #if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; unsigned long sp, bp, ip; struct pt_regs *regs, *prev_regs; #elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; /* * If non-NULL: The current frame is incomplete and doesn't contain a * valid BP. When looking for the next frame, use this instead of the * non-existent saved BP. */ unsigned long *next_bp; struct pt_regs *regs; #else unsigned long *sp; #endif }; void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame); bool unwind_next_frame(struct unwind_state *state); unsigned long unwind_get_return_address(struct unwind_state *state); unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); static inline bool unwind_done(struct unwind_state *state) { return state->stack_info.type == STACK_TYPE_UNKNOWN; } static inline bool unwind_error(struct unwind_state *state) { return state->error; } static inline void unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { first_frame = first_frame ? : get_stack_pointer(task, regs); __unwind_start(state, task, regs, first_frame); } #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) /* * If 'partial' returns true, only the iret frame registers are valid. */ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, bool *partial) { if (unwind_done(state)) return NULL; if (partial) { #ifdef CONFIG_UNWINDER_ORC *partial = !state->full_regs; #else *partial = false; #endif } return state->regs; } #else static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, bool *partial) { return NULL; } #endif #ifdef CONFIG_UNWINDER_ORC void unwind_init(void); void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size); #else static inline void unwind_init(void) {} static inline void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size) {} #endif static inline unsigned long unwind_recover_rethook(struct unwind_state *state, unsigned long addr, unsigned long *addr_p) { #ifdef CONFIG_RETHOOK if (is_rethook_trampoline(addr)) return rethook_find_ret_addr(state->task, (unsigned long)addr_p, &state->kr_cur); #endif return addr; } /* Recover the return address modified by rethook and ftrace_graph. */ static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state, unsigned long addr, unsigned long *addr_p) { unsigned long ret; ret = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, addr_p); return unwind_recover_rethook(state, ret, addr_p); } /* * This disables KASAN checking when reading a value from another task's stack, * since the other task could be running on another CPU and could have poisoned * the stack in the meantime. */ #define READ_ONCE_TASK_STACK(task, x) \ ({ \ unsigned long val; \ if (task == current) \ val = READ_ONCE(x); \ else \ val = READ_ONCE_NOCHECK(x); \ val; \ }) static inline bool task_on_another_cpu(struct task_struct *task) { #ifdef CONFIG_SMP return task != current && task->on_cpu; #else return false; #endif } #endif /* _ASM_X86_UNWIND_H */
30 27 30 30 30 30 29 30 30 30 30 30 30 30 30 17 17 30 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 // SPDX-License-Identifier: GPL-2.0-or-later /* incoming call handling * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/errqueue.h> #include <linux/udp.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/icmp.h> #include <linux/gfp.h> #include <linux/circ_buf.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <net/ip.h> #include "ar-internal.h" static void rxrpc_dummy_notify(struct sock *sk, struct rxrpc_call *call, unsigned long user_call_ID) { } /* * Preallocate a single service call, connection and peer and, if possible, * give them a user ID and attach the user's side of the ID to them. */ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, struct rxrpc_backlog *b, rxrpc_notify_rx_t notify_rx, rxrpc_user_attach_call_t user_attach_call, unsigned long user_call_ID, gfp_t gfp, unsigned int debug_id) { struct rxrpc_call *call, *xcall; struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); struct rb_node *parent, **pp; int max, tmp; unsigned int size = RXRPC_BACKLOG_MAX; unsigned int head, tail, call_head, call_tail; max = rx->sk.sk_max_ack_backlog; tmp = rx->sk.sk_ack_backlog; if (tmp >= max) { _leave(" = -ENOBUFS [full %u]", max); return -ENOBUFS; } max -= tmp; /* We don't need more conns and peers than we have calls, but on the * other hand, we shouldn't ever use more peers than conns or conns * than calls. */ call_head = b->call_backlog_head; call_tail = READ_ONCE(b->call_backlog_tail); tmp = CIRC_CNT(call_head, call_tail, size); if (tmp >= max) { _leave(" = -ENOBUFS [enough %u]", tmp); return -ENOBUFS; } max = tmp + 1; head = b->peer_backlog_head; tail = READ_ONCE(b->peer_backlog_tail); if (CIRC_CNT(head, tail, size) < max) { struct rxrpc_peer *peer; peer = rxrpc_alloc_peer(rx->local, gfp, rxrpc_peer_new_prealloc); if (!peer) return -ENOMEM; b->peer_backlog[head] = peer; smp_store_release(&b->peer_backlog_head, (head + 1) & (size - 1)); } head = b->conn_backlog_head; tail = READ_ONCE(b->conn_backlog_tail); if (CIRC_CNT(head, tail, size) < max) { struct rxrpc_connection *conn; conn = rxrpc_prealloc_service_connection(rxnet, gfp); if (!conn) return -ENOMEM; b->conn_backlog[head] = conn; smp_store_release(&b->conn_backlog_head, (head + 1) & (size - 1)); } /* Now it gets complicated, because calls get registered with the * socket here, with a user ID preassigned by the user. */ call = rxrpc_alloc_call(rx, gfp, debug_id); if (!call) return -ENOMEM; call->flags |= (1 << RXRPC_CALL_IS_SERVICE); rxrpc_set_call_state(call, RXRPC_CALL_SERVER_PREALLOC); __set_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), user_call_ID, rxrpc_call_new_prealloc_service); write_lock(&rx->call_lock); /* Check the user ID isn't already in use */ pp = &rx->calls.rb_node; parent = NULL; while (*pp) { parent = *pp; xcall = rb_entry(parent, struct rxrpc_call, sock_node); if (user_call_ID < xcall->user_call_ID) pp = &(*pp)->rb_left; else if (user_call_ID > xcall->user_call_ID) pp = &(*pp)->rb_right; else goto id_in_use; } call->user_call_ID = user_call_ID; call->notify_rx = notify_rx; if (user_attach_call) { rxrpc_get_call(call, rxrpc_call_get_kernel_service); user_attach_call(call, user_call_ID); } rxrpc_get_call(call, rxrpc_call_get_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); set_bit(RXRPC_CALL_HAS_USERID, &call->flags); list_add(&call->sock_link, &rx->sock_calls); write_unlock(&rx->call_lock); rxnet = call->rxnet; spin_lock(&rxnet->call_lock); list_add_tail_rcu(&call->link, &rxnet->calls); spin_unlock(&rxnet->call_lock); b->call_backlog[call_head] = call; smp_store_release(&b->call_backlog_head, (call_head + 1) & (size - 1)); _leave(" = 0 [%d -> %lx]", call->debug_id, user_call_ID); return 0; id_in_use: write_unlock(&rx->call_lock); rxrpc_cleanup_call(call); _leave(" = -EBADSLT"); return -EBADSLT; } /* * Allocate the preallocation buffers for incoming service calls. These must * be charged manually. */ int rxrpc_service_prealloc(struct rxrpc_sock *rx, gfp_t gfp) { struct rxrpc_backlog *b = rx->backlog; if (!b) { b = kzalloc(sizeof(struct rxrpc_backlog), gfp); if (!b) return -ENOMEM; rx->backlog = b; } return 0; } /* * Discard the preallocation on a service. */ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) { struct rxrpc_backlog *b = rx->backlog; struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); unsigned int size = RXRPC_BACKLOG_MAX, head, tail; if (!b) return; rx->backlog = NULL; /* Make sure that there aren't any incoming calls in progress before we * clear the preallocation buffers. */ spin_lock(&rx->incoming_lock); spin_unlock(&rx->incoming_lock); head = b->peer_backlog_head; tail = b->peer_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_peer *peer = b->peer_backlog[tail]; rxrpc_put_local(peer->local, rxrpc_local_put_prealloc_peer); kfree(peer); tail = (tail + 1) & (size - 1); } head = b->conn_backlog_head; tail = b->conn_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_connection *conn = b->conn_backlog[tail]; write_lock(&rxnet->conn_lock); list_del(&conn->link); list_del(&conn->proc_link); write_unlock(&rxnet->conn_lock); kfree(conn); if (atomic_dec_and_test(&rxnet->nr_conns)) wake_up_var(&rxnet->nr_conns); tail = (tail + 1) & (size - 1); } head = b->call_backlog_head; tail = b->call_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_call *call = b->call_backlog[tail]; rcu_assign_pointer(call->socket, rx); if (rx->discard_new_call) { _debug("discard %lx", call->user_call_ID); rx->discard_new_call(call, call->user_call_ID); if (call->notify_rx) call->notify_rx = rxrpc_dummy_notify; rxrpc_put_call(call, rxrpc_call_put_kernel); } rxrpc_call_completed(call); rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put_discard_prealloc); tail = (tail + 1) & (size - 1); } kfree(b); } /* * Allocate a new incoming call from the prealloc pool, along with a connection * and a peer as necessary. */ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_local *local, struct rxrpc_peer *peer, struct rxrpc_connection *conn, const struct rxrpc_security *sec, struct sockaddr_rxrpc *peer_srx, struct sk_buff *skb) { struct rxrpc_backlog *b = rx->backlog; struct rxrpc_call *call; unsigned short call_head, conn_head, peer_head; unsigned short call_tail, conn_tail, peer_tail; unsigned short call_count, conn_count; /* #calls >= #conns >= #peers must hold true. */ call_head = smp_load_acquire(&b->call_backlog_head); call_tail = b->call_backlog_tail; call_count = CIRC_CNT(call_head, call_tail, RXRPC_BACKLOG_MAX); conn_head = smp_load_acquire(&b->conn_backlog_head); conn_tail = b->conn_backlog_tail; conn_count = CIRC_CNT(conn_head, conn_tail, RXRPC_BACKLOG_MAX); ASSERTCMP(conn_count, >=, call_count); peer_head = smp_load_acquire(&b->peer_backlog_head); peer_tail = b->peer_backlog_tail; ASSERTCMP(CIRC_CNT(peer_head, peer_tail, RXRPC_BACKLOG_MAX), >=, conn_count); if (call_count == 0) return NULL; if (!conn) { if (peer && !rxrpc_get_peer_maybe(peer, rxrpc_peer_get_service_conn)) peer = NULL; if (!peer) { peer = b->peer_backlog[peer_tail]; peer->srx = *peer_srx; b->peer_backlog[peer_tail] = NULL; smp_store_release(&b->peer_backlog_tail, (peer_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); rxrpc_new_incoming_peer(local, peer); } /* Now allocate and set up the connection */ conn = b->conn_backlog[conn_tail]; b->conn_backlog[conn_tail] = NULL; smp_store_release(&b->conn_backlog_tail, (conn_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); conn->local = rxrpc_get_local(local, rxrpc_local_get_prealloc_conn); conn->peer = peer; rxrpc_see_connection(conn, rxrpc_conn_see_new_service_conn); rxrpc_new_incoming_connection(rx, conn, sec, skb); } else { rxrpc_get_connection(conn, rxrpc_conn_get_service_conn); atomic_inc(&conn->active); } /* And now we can allocate and set up a new call */ call = b->call_backlog[call_tail]; b->call_backlog[call_tail] = NULL; smp_store_release(&b->call_backlog_tail, (call_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); rxrpc_see_call(call, rxrpc_call_see_accept); call->local = rxrpc_get_local(conn->local, rxrpc_local_get_call); call->conn = conn; call->security = conn->security; call->security_ix = conn->security_ix; call->peer = rxrpc_get_peer(conn->peer, rxrpc_peer_get_accept); call->dest_srx = peer->srx; call->cong_ssthresh = call->peer->cong_ssthresh; call->tx_last_sent = ktime_get_real(); return call; } /* * Set up a new incoming call. Called from the I/O thread. * * If this is for a kernel service, when we allocate the call, it will have * three refs on it: (1) the kernel service, (2) the user_call_ID tree, (3) the * retainer ref obtained from the backlog buffer. Prealloc calls for userspace * services only have the ref from the backlog buffer. * * If we want to report an error, we mark the skb with the packet type and * abort code and return false. */ bool rxrpc_new_incoming_call(struct rxrpc_local *local, struct rxrpc_peer *peer, struct rxrpc_connection *conn, struct sockaddr_rxrpc *peer_srx, struct sk_buff *skb) { const struct rxrpc_security *sec = NULL; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_call *call = NULL; struct rxrpc_sock *rx; _enter(""); /* Don't set up a call for anything other than a DATA packet. */ if (sp->hdr.type != RXRPC_PACKET_TYPE_DATA) return rxrpc_protocol_error(skb, rxrpc_eproto_no_service_call); read_lock(&local->services_lock); /* Weed out packets to services we're not offering. Packets that would * begin a call are explicitly rejected and the rest are just * discarded. */ rx = local->service; if (!rx || (sp->hdr.serviceId != rx->srx.srx_service && sp->hdr.serviceId != rx->second_service) ) { if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && sp->hdr.seq == 1) goto unsupported_service; goto discard; } if (!conn) { sec = rxrpc_get_incoming_security(rx, skb); if (!sec) goto unsupported_security; } spin_lock(&rx->incoming_lock); if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED || rx->sk.sk_state == RXRPC_CLOSE) { rxrpc_direct_abort(skb, rxrpc_abort_shut_down, RX_INVALID_OPERATION, -ESHUTDOWN); goto no_call; } call = rxrpc_alloc_incoming_call(rx, local, peer, conn, sec, peer_srx, skb); if (!call) { skb->mark = RXRPC_SKB_MARK_REJECT_BUSY; goto no_call; } trace_rxrpc_receive(call, rxrpc_receive_incoming, sp->hdr.serial, sp->hdr.seq); /* Make the call live. */ rxrpc_incoming_call(rx, call, skb); conn = call->conn; if (rx->notify_new_call) rx->notify_new_call(&rx->sk, call, call->user_call_ID); spin_lock(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE_UNSECURED) { conn->state = RXRPC_CONN_SERVICE_CHALLENGING; set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events); rxrpc_queue_conn(call->conn, rxrpc_conn_queue_challenge); } spin_unlock(&conn->state_lock); spin_unlock(&rx->incoming_lock); read_unlock(&local->services_lock); if (hlist_unhashed(&call->error_link)) { spin_lock(&call->peer->lock); hlist_add_head(&call->error_link, &call->peer->error_targets); spin_unlock(&call->peer->lock); } _leave(" = %p{%d}", call, call->debug_id); rxrpc_input_call_event(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); return true; unsupported_service: read_unlock(&local->services_lock); return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, RX_INVALID_OPERATION, -EOPNOTSUPP); unsupported_security: read_unlock(&local->services_lock); return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, RX_INVALID_OPERATION, -EKEYREJECTED); no_call: spin_unlock(&rx->incoming_lock); read_unlock(&local->services_lock); _leave(" = f [%u]", skb->mark); return false; discard: read_unlock(&local->services_lock); return true; } /* * Charge up socket with preallocated calls, attaching user call IDs. */ int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID) { struct rxrpc_backlog *b = rx->backlog; if (rx->sk.sk_state == RXRPC_CLOSE) return -ESHUTDOWN; return rxrpc_service_prealloc_one(rx, b, NULL, NULL, user_call_ID, GFP_KERNEL, atomic_inc_return(&rxrpc_debug_id)); } /* * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls * @sock: The socket on which to preallocate * @notify_rx: Event notification function for the call * @user_attach_call: Func to attach call to user_call_ID * @user_call_ID: The tag to attach to the preallocated call * @gfp: The allocation conditions. * @debug_id: The tracing debug ID. * * Charge up the socket with preallocated calls, each with a user ID. A * function should be provided to effect the attachment from the user's side. * The user is given a ref to hold on the call. * * Note that the call may be come connected before this function returns. */ int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, rxrpc_user_attach_call_t user_attach_call, unsigned long user_call_ID, gfp_t gfp, unsigned int debug_id) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct rxrpc_backlog *b = rx->backlog; if (sock->sk->sk_state == RXRPC_CLOSE) return -ESHUTDOWN; return rxrpc_service_prealloc_one(rx, b, notify_rx, user_attach_call, user_call_ID, gfp, debug_id); } EXPORT_SYMBOL(rxrpc_kernel_charge_accept);
30 3 27 27 27 27 31 4 9 30 1 26 3 1 2 2 2 2 2 20 13 7 11 9 3 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 // SPDX-License-Identifier: GPL-2.0-only /* * File: af_phonet.c * * Phonet protocols family * * Copyright (C) 2008 Nokia Corporation. * * Authors: Sakari Ailus <sakari.ailus@nokia.com> * Rémi Denis-Courmont */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/unaligned.h> #include <net/sock.h> #include <linux/if_phonet.h> #include <linux/phonet.h> #include <net/phonet/phonet.h> #include <net/phonet/pn_dev.h> /* Transport protocol registration */ static const struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly; static const struct phonet_protocol *phonet_proto_get(unsigned int protocol) { const struct phonet_protocol *pp; if (protocol >= PHONET_NPROTO) return NULL; rcu_read_lock(); pp = rcu_dereference(proto_tab[protocol]); if (pp && !try_module_get(pp->prot->owner)) pp = NULL; rcu_read_unlock(); return pp; } static inline void phonet_proto_put(const struct phonet_protocol *pp) { module_put(pp->prot->owner); } /* protocol family functions */ static int pn_socket_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct pn_sock *pn; const struct phonet_protocol *pnp; int err; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (protocol == 0) { /* Default protocol selection */ switch (sock->type) { case SOCK_DGRAM: protocol = PN_PROTO_PHONET; break; case SOCK_SEQPACKET: protocol = PN_PROTO_PIPE; break; default: return -EPROTONOSUPPORT; } } pnp = phonet_proto_get(protocol); if (pnp == NULL && request_module("net-pf-%d-proto-%d", PF_PHONET, protocol) == 0) pnp = phonet_proto_get(protocol); if (pnp == NULL) return -EPROTONOSUPPORT; if (sock->type != pnp->sock_type) { err = -EPROTONOSUPPORT; goto out; } sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot, kern); if (sk == NULL) { err = -ENOMEM; goto out; } sock_init_data(sock, sk); sock->state = SS_UNCONNECTED; sock->ops = pnp->ops; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; sk->sk_protocol = protocol; pn = pn_sk(sk); pn->sobject = 0; pn->dobject = 0; pn->resource = 0; sk->sk_prot->init(sk); err = 0; out: phonet_proto_put(pnp); return err; } static const struct net_proto_family phonet_proto_family = { .family = PF_PHONET, .create = pn_socket_create, .owner = THIS_MODULE, }; /* Phonet device header operations */ static int pn_header_create(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { u8 *media = skb_push(skb, 1); if (type != ETH_P_PHONET) return -1; if (!saddr) saddr = dev->dev_addr; *media = *(const u8 *)saddr; return 1; } static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr) { const u8 *media = skb_mac_header(skb); *haddr = *media; return 1; } const struct header_ops phonet_header_ops = { .create = pn_header_create, .parse = pn_header_parse, }; EXPORT_SYMBOL(phonet_header_ops); /* * Prepends an ISI header and sends a datagram. */ static int pn_send(struct sk_buff *skb, struct net_device *dev, u16 dst, u16 src, u8 res) { struct phonethdr *ph; int err; if (skb->len + 2 > 0xffff /* Phonet length field limit */ || skb->len + sizeof(struct phonethdr) > dev->mtu) { err = -EMSGSIZE; goto drop; } /* Broadcast sending is not implemented */ if (pn_addr(dst) == PNADDR_BROADCAST) { err = -EOPNOTSUPP; goto drop; } skb_reset_transport_header(skb); WARN_ON(skb_headroom(skb) & 1); /* HW assumes word alignment */ skb_push(skb, sizeof(struct phonethdr)); skb_reset_network_header(skb); ph = pn_hdr(skb); ph->pn_rdev = pn_dev(dst); ph->pn_sdev = pn_dev(src); ph->pn_res = res; ph->pn_length = __cpu_to_be16(skb->len + 2 - sizeof(*ph)); ph->pn_robj = pn_obj(dst); ph->pn_sobj = pn_obj(src); skb->protocol = htons(ETH_P_PHONET); skb->priority = 0; skb->dev = dev; if (skb->pkt_type == PACKET_LOOPBACK) { skb_reset_mac_header(skb); skb_orphan(skb); err = netif_rx(skb) ? -ENOBUFS : 0; } else { err = dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, skb->len); if (err < 0) { err = -EHOSTUNREACH; goto drop; } err = dev_queue_xmit(skb); if (unlikely(err > 0)) err = net_xmit_errno(err); } return err; drop: kfree_skb(skb); return err; } static int pn_raw_send(const void *data, int len, struct net_device *dev, u16 dst, u16 src, u8 res) { struct sk_buff *skb = alloc_skb(MAX_PHONET_HEADER + len, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; if (phonet_address_lookup(dev_net(dev), pn_addr(dst)) == 0) skb->pkt_type = PACKET_LOOPBACK; skb_reserve(skb, MAX_PHONET_HEADER); __skb_put(skb, len); skb_copy_to_linear_data(skb, data, len); return pn_send(skb, dev, dst, src, res); } /* * Create a Phonet header for the skb and send it out. Returns * non-zero error code if failed. The skb is freed then. */ int pn_skb_send(struct sock *sk, struct sk_buff *skb, const struct sockaddr_pn *target) { struct net *net = sock_net(sk); struct net_device *dev; struct pn_sock *pn = pn_sk(sk); int err; u16 src, dst; u8 daddr, saddr, res; src = pn->sobject; if (target != NULL) { dst = pn_sockaddr_get_object(target); res = pn_sockaddr_get_resource(target); } else { dst = pn->dobject; res = pn->resource; } daddr = pn_addr(dst); err = -EHOSTUNREACH; if (sk->sk_bound_dev_if) dev = dev_get_by_index(net, sk->sk_bound_dev_if); else if (phonet_address_lookup(net, daddr) == 0) { dev = phonet_device_get(net); skb->pkt_type = PACKET_LOOPBACK; } else if (dst == 0) { /* Resource routing (small race until phonet_rcv()) */ struct sock *sk = pn_find_sock_by_res(net, res); if (sk) { sock_put(sk); dev = phonet_device_get(net); skb->pkt_type = PACKET_LOOPBACK; } else dev = phonet_route_output(net, daddr); } else dev = phonet_route_output(net, daddr); if (!dev || !(dev->flags & IFF_UP)) goto drop; saddr = phonet_address_get(dev, daddr); if (saddr == PN_NO_ADDR) goto drop; if (!pn_addr(src)) src = pn_object(saddr, pn_obj(src)); err = pn_send(skb, dev, dst, src, res); dev_put(dev); return err; drop: kfree_skb(skb); dev_put(dev); return err; } EXPORT_SYMBOL(pn_skb_send); /* Do not send an error message in response to an error message */ static inline int can_respond(struct sk_buff *skb) { const struct phonethdr *ph; const struct phonetmsg *pm; u8 submsg_id; if (!pskb_may_pull(skb, 3)) return 0; ph = pn_hdr(skb); if (ph->pn_res == PN_PREFIX && !pskb_may_pull(skb, 5)) return 0; if (ph->pn_res == PN_COMMGR) /* indications */ return 0; ph = pn_hdr(skb); /* re-acquires the pointer */ pm = pn_msg(skb); if (pm->pn_msg_id != PN_COMMON_MESSAGE) return 1; submsg_id = (ph->pn_res == PN_PREFIX) ? pm->pn_e_submsg_id : pm->pn_submsg_id; if (submsg_id != PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP && pm->pn_e_submsg_id != PN_COMM_SERVICE_NOT_IDENTIFIED_RESP) return 1; return 0; } static int send_obj_unreachable(struct sk_buff *rskb) { const struct phonethdr *oph = pn_hdr(rskb); const struct phonetmsg *opm = pn_msg(rskb); struct phonetmsg resp; memset(&resp, 0, sizeof(resp)); resp.pn_trans_id = opm->pn_trans_id; resp.pn_msg_id = PN_COMMON_MESSAGE; if (oph->pn_res == PN_PREFIX) { resp.pn_e_res_id = opm->pn_e_res_id; resp.pn_e_submsg_id = PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP; resp.pn_e_orig_msg_id = opm->pn_msg_id; resp.pn_e_status = 0; } else { resp.pn_submsg_id = PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP; resp.pn_orig_msg_id = opm->pn_msg_id; resp.pn_status = 0; } return pn_raw_send(&resp, sizeof(resp), rskb->dev, pn_object(oph->pn_sdev, oph->pn_sobj), pn_object(oph->pn_rdev, oph->pn_robj), oph->pn_res); } static int send_reset_indications(struct sk_buff *rskb) { struct phonethdr *oph = pn_hdr(rskb); static const u8 data[4] = { 0x00 /* trans ID */, 0x10 /* subscribe msg */, 0x00 /* subscription count */, 0x00 /* dummy */ }; return pn_raw_send(data, sizeof(data), rskb->dev, pn_object(oph->pn_sdev, 0x00), pn_object(oph->pn_rdev, oph->pn_robj), PN_COMMGR); } /* packet type functions */ /* * Stuff received packets to associated sockets. * On error, returns non-zero and releases the skb. */ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pkttype, struct net_device *orig_dev) { struct net *net = dev_net(dev); struct phonethdr *ph; struct sockaddr_pn sa; u16 len; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_RX_DROP; /* check we have at least a full Phonet header */ if (!pskb_pull(skb, sizeof(struct phonethdr))) goto out; /* check that the advertised length is correct */ ph = pn_hdr(skb); len = get_unaligned_be16(&ph->pn_length); if (len < 2) goto out; len -= 2; if ((len > skb->len) || pskb_trim(skb, len)) goto out; skb_reset_transport_header(skb); pn_skb_get_dst_sockaddr(skb, &sa); /* check if this is broadcasted */ if (pn_sockaddr_get_addr(&sa) == PNADDR_BROADCAST) { pn_deliver_sock_broadcast(net, skb); goto out; } /* resource routing */ if (pn_sockaddr_get_object(&sa) == 0) { struct sock *sk = pn_find_sock_by_res(net, sa.spn_resource); if (sk) return sk_receive_skb(sk, skb, 0); } /* check if we are the destination */ if (phonet_address_lookup(net, pn_sockaddr_get_addr(&sa)) == 0) { /* Phonet packet input */ struct sock *sk = pn_find_sock_by_sa(net, &sa); if (sk) return sk_receive_skb(sk, skb, 0); if (can_respond(skb)) { send_obj_unreachable(skb); send_reset_indications(skb); } } else if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) goto out; /* Race between address deletion and loopback */ else { /* Phonet packet routing */ struct net_device *out_dev; out_dev = phonet_route_output(net, pn_sockaddr_get_addr(&sa)); if (!out_dev) { net_dbg_ratelimited("No Phonet route to %02X\n", pn_sockaddr_get_addr(&sa)); goto out; } __skb_push(skb, sizeof(struct phonethdr)); skb->dev = out_dev; if (out_dev == dev) { net_dbg_ratelimited("Phonet loop to %02X on %s\n", pn_sockaddr_get_addr(&sa), dev->name); goto out_dev; } /* Some drivers (e.g. TUN) do not allocate HW header space */ if (skb_cow_head(skb, out_dev->hard_header_len)) goto out_dev; if (dev_hard_header(skb, out_dev, ETH_P_PHONET, NULL, NULL, skb->len) < 0) goto out_dev; dev_queue_xmit(skb); dev_put(out_dev); return NET_RX_SUCCESS; out_dev: dev_put(out_dev); } out: kfree_skb(skb); return NET_RX_DROP; } static struct packet_type phonet_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_PHONET), .func = phonet_rcv, }; static DEFINE_MUTEX(proto_tab_lock); int __init_or_module phonet_proto_register(unsigned int protocol, const struct phonet_protocol *pp) { int err = 0; if (protocol >= PHONET_NPROTO) return -EINVAL; err = proto_register(pp->prot, 1); if (err) return err; mutex_lock(&proto_tab_lock); if (proto_tab[protocol]) err = -EBUSY; else rcu_assign_pointer(proto_tab[protocol], pp); mutex_unlock(&proto_tab_lock); return err; } EXPORT_SYMBOL(phonet_proto_register); void phonet_proto_unregister(unsigned int protocol, const struct phonet_protocol *pp) { mutex_lock(&proto_tab_lock); BUG_ON(proto_tab[protocol] != pp); RCU_INIT_POINTER(proto_tab[protocol], NULL); mutex_unlock(&proto_tab_lock); synchronize_rcu(); proto_unregister(pp->prot); } EXPORT_SYMBOL(phonet_proto_unregister); /* Module registration */ static int __init phonet_init(void) { int err; err = phonet_device_init(); if (err) return err; pn_sock_init(); err = sock_register(&phonet_proto_family); if (err) { printk(KERN_ALERT "phonet protocol family initialization failed\n"); goto err_sock; } dev_add_pack(&phonet_packet_type); phonet_sysctl_init(); err = isi_register(); if (err) goto err; return 0; err: phonet_sysctl_exit(); sock_unregister(PF_PHONET); dev_remove_pack(&phonet_packet_type); err_sock: phonet_device_exit(); return err; } static void __exit phonet_exit(void) { isi_unregister(); phonet_sysctl_exit(); sock_unregister(PF_PHONET); dev_remove_pack(&phonet_packet_type); phonet_device_exit(); } module_init(phonet_init); module_exit(phonet_exit); MODULE_DESCRIPTION("Phonet protocol stack for Linux"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_PHONET);
2101 2102 2100 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 // SPDX-License-Identifier: GPL-2.0-only // Copyright (C) 2022 Linutronix GmbH, John Ogness // Copyright (C) 2022 Intel, Thomas Gleixner #include <linux/atomic.h> #include <linux/bug.h> #include <linux/console.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/init.h> #include <linux/irqflags.h> #include <linux/kthread.h> #include <linux/minmax.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/slab.h> #include <linux/smp.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include "internal.h" #include "printk_ringbuffer.h" /* * Printk console printing implementation for consoles which does not depend * on the legacy style console_lock mechanism. * * The state of the console is maintained in the "nbcon_state" atomic * variable. * * The console is locked when: * * - The 'prio' field contains the priority of the context that owns the * console. Only higher priority contexts are allowed to take over the * lock. A value of 0 (NBCON_PRIO_NONE) means the console is not locked. * * - The 'cpu' field denotes on which CPU the console is locked. It is used * to prevent busy waiting on the same CPU. Also it informs the lock owner * that it has lost the lock in a more complex scenario when the lock was * taken over by a higher priority context, released, and taken on another * CPU with the same priority as the interrupted owner. * * The acquire mechanism uses a few more fields: * * - The 'req_prio' field is used by the handover approach to make the * current owner aware that there is a context with a higher priority * waiting for the friendly handover. * * - The 'unsafe' field allows to take over the console in a safe way in the * middle of emitting a message. The field is set only when accessing some * shared resources or when the console device is manipulated. It can be * cleared, for example, after emitting one character when the console * device is in a consistent state. * * - The 'unsafe_takeover' field is set when a hostile takeover took the * console in an unsafe state. The console will stay in the unsafe state * until re-initialized. * * The acquire mechanism uses three approaches: * * 1) Direct acquire when the console is not owned or is owned by a lower * priority context and is in a safe state. * * 2) Friendly handover mechanism uses a request/grant handshake. It is used * when the current owner has lower priority and the console is in an * unsafe state. * * The requesting context: * * a) Sets its priority into the 'req_prio' field. * * b) Waits (with a timeout) for the owning context to unlock the * console. * * c) Takes the lock and clears the 'req_prio' field. * * The owning context: * * a) Observes the 'req_prio' field set on exit from the unsafe * console state. * * b) Gives up console ownership by clearing the 'prio' field. * * 3) Unsafe hostile takeover allows to take over the lock even when the * console is an unsafe state. It is used only in panic() by the final * attempt to flush consoles in a try and hope mode. * * Note that separate record buffers are used in panic(). As a result, * the messages can be read and formatted without any risk even after * using the hostile takeover in unsafe state. * * The release function simply clears the 'prio' field. * * All operations on @console::nbcon_state are atomic cmpxchg based to * handle concurrency. * * The acquire/release functions implement only minimal policies: * * - Preference for higher priority contexts. * - Protection of the panic CPU. * * All other policy decisions must be made at the call sites: * * - What is marked as an unsafe section. * - Whether to spin-wait if there is already an owner and the console is * in an unsafe state. * - Whether to attempt an unsafe hostile takeover. * * The design allows to implement the well known: * * acquire() * output_one_printk_record() * release() * * The output of one printk record might be interrupted with a higher priority * context. The new owner is supposed to reprint the entire interrupted record * from scratch. */ /** * nbcon_state_set - Helper function to set the console state * @con: Console to update * @new: The new state to write * * Only to be used when the console is not yet or no longer visible in the * system. Otherwise use nbcon_state_try_cmpxchg(). */ static inline void nbcon_state_set(struct console *con, struct nbcon_state *new) { atomic_set(&ACCESS_PRIVATE(con, nbcon_state), new->atom); } /** * nbcon_state_read - Helper function to read the console state * @con: Console to read * @state: The state to store the result */ static inline void nbcon_state_read(struct console *con, struct nbcon_state *state) { state->atom = atomic_read(&ACCESS_PRIVATE(con, nbcon_state)); } /** * nbcon_state_try_cmpxchg() - Helper function for atomic_try_cmpxchg() on console state * @con: Console to update * @cur: Old/expected state * @new: New state * * Return: True on success. False on fail and @cur is updated. */ static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_state *cur, struct nbcon_state *new) { return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom); } /** * nbcon_seq_read - Read the current console sequence * @con: Console to read the sequence of * * Return: Sequence number of the next record to print on @con. */ u64 nbcon_seq_read(struct console *con) { unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq)); return __ulseq_to_u64seq(prb, nbcon_seq); } /** * nbcon_seq_force - Force console sequence to a specific value * @con: Console to work on * @seq: Sequence number value to set * * Only to be used during init (before registration) or in extreme situations * (such as panic with CONSOLE_REPLAY_ALL). */ void nbcon_seq_force(struct console *con, u64 seq) { /* * If the specified record no longer exists, the oldest available record * is chosen. This is especially important on 32bit systems because only * the lower 32 bits of the sequence number are stored. The upper 32 bits * are derived from the sequence numbers available in the ringbuffer. */ u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb)); atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq)); } /** * nbcon_seq_try_update - Try to update the console sequence number * @ctxt: Pointer to an acquire context that contains * all information about the acquire mode * @new_seq: The new sequence number to set * * @ctxt->seq is updated to the new value of @con::nbcon_seq (expanded to * the 64bit value). This could be a different value than @new_seq if * nbcon_seq_force() was used or the current context no longer owns the * console. In the later case, it will stop printing anyway. */ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) { unsigned long nbcon_seq = __u64seq_to_ulseq(ctxt->seq); struct console *con = ctxt->console; if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq, __u64seq_to_ulseq(new_seq))) { ctxt->seq = new_seq; } else { ctxt->seq = nbcon_seq_read(con); } } /** * nbcon_context_try_acquire_direct - Try to acquire directly * @ctxt: The context of the caller * @cur: The current console state * * Acquire the console when it is released. Also acquire the console when * the current owner has a lower priority and the console is in a safe state. * * Return: 0 on success. Otherwise, an error code on failure. Also @cur * is updated to the latest state when failed to modify it. * * Errors: * * -EPERM: A panic is in progress and this is not the panic CPU. * Or the current owner or waiter has the same or higher * priority. No acquire method can be successful in * this case. * * -EBUSY: The current owner has a lower priority but the console * in an unsafe state. The caller should try using * the handover acquire method. */ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state new; do { /* * Panic does not imply that the console is owned. However, it * is critical that non-panic CPUs during panic are unable to * acquire ownership in order to satisfy the assumptions of * nbcon_waiter_matches(). In particular, the assumption that * lower priorities are ignored during panic. */ if (other_cpu_in_panic()) return -EPERM; if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio) return -EPERM; if (cur->unsafe) return -EBUSY; /* * The console should never be safe for a direct acquire * if an unsafe hostile takeover has ever happened. */ WARN_ON_ONCE(cur->unsafe_takeover); new.atom = cur->atom; new.prio = ctxt->prio; new.req_prio = NBCON_PRIO_NONE; new.unsafe = cur->unsafe_takeover; new.cpu = cpu; } while (!nbcon_state_try_cmpxchg(con, cur, &new)); return 0; } static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) { /* * The request context is well defined by the @req_prio because: * * - Only a context with a priority higher than the owner can become * a waiter. * - Only a context with a priority higher than the waiter can * directly take over the request. * - There are only three priorities. * - Only one CPU is allowed to request PANIC priority. * - Lower priorities are ignored during panic() until reboot. * * As a result, the following scenario is *not* possible: * * 1. This context is currently a waiter. * 2. Another context with a higher priority than this context * directly takes ownership. * 3. The higher priority context releases the ownership. * 4. Another lower priority context takes the ownership. * 5. Another context with the same priority as this context * creates a request and starts waiting. * * Event #1 implies this context is EMERGENCY. * Event #2 implies the new context is PANIC. * Event #3 occurs when panic() has flushed the console. * Events #4 and #5 are not possible due to the other_cpu_in_panic() * check in nbcon_context_try_acquire_direct(). */ return (cur->req_prio == expected_prio); } /** * nbcon_context_try_acquire_requested - Try to acquire after having * requested a handover * @ctxt: The context of the caller * @cur: The current console state * * This is a helper function for nbcon_context_try_acquire_handover(). * It is called when the console is in an unsafe state. The current * owner will release the console on exit from the unsafe region. * * Return: 0 on success and @cur is updated to the new console state. * Otherwise an error code on failure. * * Errors: * * -EPERM: A panic is in progress and this is not the panic CPU * or this context is no longer the waiter. * * -EBUSY: The console is still locked. The caller should * continue waiting. * * Note: The caller must still remove the request when an error has occurred * except when this context is no longer the waiter. */ static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state new; /* Note that the caller must still remove the request! */ if (other_cpu_in_panic()) return -EPERM; /* * Note that the waiter will also change if there was an unsafe * hostile takeover. */ if (!nbcon_waiter_matches(cur, ctxt->prio)) return -EPERM; /* If still locked, caller should continue waiting. */ if (cur->prio != NBCON_PRIO_NONE) return -EBUSY; /* * The previous owner should have never released ownership * in an unsafe region. */ WARN_ON_ONCE(cur->unsafe); new.atom = cur->atom; new.prio = ctxt->prio; new.req_prio = NBCON_PRIO_NONE; new.unsafe = cur->unsafe_takeover; new.cpu = cpu; if (!nbcon_state_try_cmpxchg(con, cur, &new)) { /* * The acquire could fail only when it has been taken * over by a higher priority context. */ WARN_ON_ONCE(nbcon_waiter_matches(cur, ctxt->prio)); return -EPERM; } /* Handover success. This context now owns the console. */ return 0; } /** * nbcon_context_try_acquire_handover - Try to acquire via handover * @ctxt: The context of the caller * @cur: The current console state * * The function must be called only when the context has higher priority * than the current owner and the console is in an unsafe state. * It is the case when nbcon_context_try_acquire_direct() returns -EBUSY. * * The function sets "req_prio" field to make the current owner aware of * the request. Then it waits until the current owner releases the console, * or an even higher context takes over the request, or timeout expires. * * The current owner checks the "req_prio" field on exit from the unsafe * region and releases the console. It does not touch the "req_prio" field * so that the console stays reserved for the waiter. * * Return: 0 on success. Otherwise, an error code on failure. Also @cur * is updated to the latest state when failed to modify it. * * Errors: * * -EPERM: A panic is in progress and this is not the panic CPU. * Or a higher priority context has taken over the * console or the handover request. * * -EBUSY: The current owner is on the same CPU so that the hand * shake could not work. Or the current owner is not * willing to wait (zero timeout). Or the console does * not enter the safe state before timeout passed. The * caller might still use the unsafe hostile takeover * when allowed. * * -EAGAIN: @cur has changed when creating the handover request. * The caller should retry with direct acquire. */ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state new; int timeout; int request_err = -EBUSY; /* * Check that the handover is called when the direct acquire failed * with -EBUSY. */ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio); WARN_ON_ONCE(!cur->unsafe); /* Handover is not possible on the same CPU. */ if (cur->cpu == cpu) return -EBUSY; /* * Console stays unsafe after an unsafe takeover until re-initialized. * Waiting is not going to help in this case. */ if (cur->unsafe_takeover) return -EBUSY; /* Is the caller willing to wait? */ if (ctxt->spinwait_max_us == 0) return -EBUSY; /* * Setup a request for the handover. The caller should try to acquire * the console directly when the current state has been modified. */ new.atom = cur->atom; new.req_prio = ctxt->prio; if (!nbcon_state_try_cmpxchg(con, cur, &new)) return -EAGAIN; cur->atom = new.atom; /* Wait until there is no owner and then acquire the console. */ for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) { /* On successful acquire, this request is cleared. */ request_err = nbcon_context_try_acquire_requested(ctxt, cur); if (!request_err) return 0; /* * If the acquire should be aborted, it must be ensured * that the request is removed before returning to caller. */ if (request_err == -EPERM) break; udelay(1); /* Re-read the state because some time has passed. */ nbcon_state_read(con, cur); } /* Timed out or aborted. Carefully remove handover request. */ do { /* * No need to remove request if there is a new waiter. This * can only happen if a higher priority context has taken over * the console or the handover request. */ if (!nbcon_waiter_matches(cur, ctxt->prio)) return -EPERM; /* Unset request for handover. */ new.atom = cur->atom; new.req_prio = NBCON_PRIO_NONE; if (nbcon_state_try_cmpxchg(con, cur, &new)) { /* * Request successfully unset. Report failure of * acquiring via handover. */ cur->atom = new.atom; return request_err; } /* * Unable to remove request. Try to acquire in case * the owner has released the lock. */ } while (nbcon_context_try_acquire_requested(ctxt, cur)); /* Lucky timing. The acquire succeeded while removing the request. */ return 0; } /** * nbcon_context_try_acquire_hostile - Acquire via unsafe hostile takeover * @ctxt: The context of the caller * @cur: The current console state * * Acquire the console even in the unsafe state. * * It can be permitted by setting the 'allow_unsafe_takeover' field only * by the final attempt to flush messages in panic(). * * Return: 0 on success. -EPERM when not allowed by the context. */ static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state new; if (!ctxt->allow_unsafe_takeover) return -EPERM; /* Ensure caller is allowed to perform unsafe hostile takeovers. */ if (WARN_ON_ONCE(ctxt->prio != NBCON_PRIO_PANIC)) return -EPERM; /* * Check that try_acquire_direct() and try_acquire_handover() returned * -EBUSY in the right situation. */ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio); WARN_ON_ONCE(cur->unsafe != true); do { new.atom = cur->atom; new.cpu = cpu; new.prio = ctxt->prio; new.unsafe |= cur->unsafe_takeover; new.unsafe_takeover |= cur->unsafe; } while (!nbcon_state_try_cmpxchg(con, cur, &new)); return 0; } static struct printk_buffers panic_nbcon_pbufs; /** * nbcon_context_try_acquire - Try to acquire nbcon console * @ctxt: The context of the caller * * Context: Under @ctxt->con->device_lock() or local_irq_save(). * Return: True if the console was acquired. False otherwise. * * If the caller allowed an unsafe hostile takeover, on success the * caller should check the current console state to see if it is * in an unsafe state. Otherwise, on success the caller may assume * the console is not in an unsafe state. */ static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state cur; int err; nbcon_state_read(con, &cur); try_again: err = nbcon_context_try_acquire_direct(ctxt, &cur); if (err != -EBUSY) goto out; err = nbcon_context_try_acquire_handover(ctxt, &cur); if (err == -EAGAIN) goto try_again; if (err != -EBUSY) goto out; err = nbcon_context_try_acquire_hostile(ctxt, &cur); out: if (err) return false; /* Acquire succeeded. */ /* Assign the appropriate buffer for this context. */ if (atomic_read(&panic_cpu) == cpu) ctxt->pbufs = &panic_nbcon_pbufs; else ctxt->pbufs = con->pbufs; /* Set the record sequence for this context to print. */ ctxt->seq = nbcon_seq_read(ctxt->console); return true; } static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu, int expected_prio) { /* * A similar function, nbcon_waiter_matches(), only deals with * EMERGENCY and PANIC priorities. However, this function must also * deal with the NORMAL priority, which requires additional checks * and constraints. * * For the case where preemption and interrupts are disabled, it is * enough to also verify that the owning CPU has not changed. * * For the case where preemption or interrupts are enabled, an * external synchronization method *must* be used. In particular, * the driver-specific locking mechanism used in device_lock() * (including disabling migration) should be used. It prevents * scenarios such as: * * 1. [Task A] owns a context with NBCON_PRIO_NORMAL on [CPU X] and * is scheduled out. * 2. Another context takes over the lock with NBCON_PRIO_EMERGENCY * and releases it. * 3. [Task B] acquires a context with NBCON_PRIO_NORMAL on [CPU X] * and is scheduled out. * 4. [Task A] gets running on [CPU X] and sees that the console is * still owned by a task on [CPU X] with NBON_PRIO_NORMAL. Thus * [Task A] thinks it is the owner when it is not. */ if (cur->prio != expected_prio) return false; if (cur->cpu != expected_cpu) return false; return true; } /** * nbcon_context_release - Release the console * @ctxt: The nbcon context from nbcon_context_try_acquire() */ static void nbcon_context_release(struct nbcon_context *ctxt) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state cur; struct nbcon_state new; nbcon_state_read(con, &cur); do { if (!nbcon_owner_matches(&cur, cpu, ctxt->prio)) break; new.atom = cur.atom; new.prio = NBCON_PRIO_NONE; /* * If @unsafe_takeover is set, it is kept set so that * the state remains permanently unsafe. */ new.unsafe |= cur.unsafe_takeover; } while (!nbcon_state_try_cmpxchg(con, &cur, &new)); ctxt->pbufs = NULL; } /** * nbcon_context_can_proceed - Check whether ownership can proceed * @ctxt: The nbcon context from nbcon_context_try_acquire() * @cur: The current console state * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * Must be invoked when entering the unsafe state to make sure that it still * owns the lock. Also must be invoked when exiting the unsafe context * to eventually free the lock for a higher priority context which asked * for the friendly handover. * * It can be called inside an unsafe section when the console is just * temporary in safe state instead of exiting and entering the unsafe * state. * * Also it can be called in the safe context before doing an expensive * safe operation. It does not make sense to do the operation when * a higher priority context took the lock. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. */ static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); /* Make sure this context still owns the console. */ if (!nbcon_owner_matches(cur, cpu, ctxt->prio)) return false; /* The console owner can proceed if there is no waiter. */ if (cur->req_prio == NBCON_PRIO_NONE) return true; /* * A console owner within an unsafe region is always allowed to * proceed, even if there are waiters. It can perform a handover * when exiting the unsafe region. Otherwise the waiter will * need to perform an unsafe hostile takeover. */ if (cur->unsafe) return true; /* Waiters always have higher priorities than owners. */ WARN_ON_ONCE(cur->req_prio <= cur->prio); /* * Having a safe point for take over and eventually a few * duplicated characters or a full line is way better than a * hostile takeover. Post processing can take care of the garbage. * Release and hand over. */ nbcon_context_release(ctxt); /* * It is not clear whether the waiter really took over ownership. The * outermost callsite must make the final decision whether console * ownership is needed for it to proceed. If yes, it must reacquire * ownership (possibly hostile) before carefully proceeding. * * The calling context no longer owns the console so go back all the * way instead of trying to implement reacquire heuristics in tons of * places. */ return false; } /** * nbcon_can_proceed - Check whether ownership can proceed * @wctxt: The write context that was handed to the write function * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * It is used in nbcon_enter_unsafe() to make sure that it still owns the * lock. Also it is used in nbcon_exit_unsafe() to eventually free the lock * for a higher priority context which asked for the friendly handover. * * It can be called inside an unsafe section when the console is just * temporary in safe state instead of exiting and entering the unsafe state. * * Also it can be called in the safe context before doing an expensive safe * operation. It does not make sense to do the operation when a higher * priority context took the lock. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. */ bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; struct nbcon_state cur; nbcon_state_read(con, &cur); return nbcon_context_can_proceed(ctxt, &cur); } EXPORT_SYMBOL_GPL(nbcon_can_proceed); #define nbcon_context_enter_unsafe(c) __nbcon_context_update_unsafe(c, true) #define nbcon_context_exit_unsafe(c) __nbcon_context_update_unsafe(c, false) /** * __nbcon_context_update_unsafe - Update the unsafe bit in @con->nbcon_state * @ctxt: The nbcon context from nbcon_context_try_acquire() * @unsafe: The new value for the unsafe bit * * Return: True if the unsafe state was updated and this context still * owns the console. Otherwise false if ownership was handed * over or taken. * * This function allows console owners to modify the unsafe status of the * console. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. * * Internal helper to avoid duplicated code. */ static bool __nbcon_context_update_unsafe(struct nbcon_context *ctxt, bool unsafe) { struct console *con = ctxt->console; struct nbcon_state cur; struct nbcon_state new; nbcon_state_read(con, &cur); do { /* * The unsafe bit must not be cleared if an * unsafe hostile takeover has occurred. */ if (!unsafe && cur.unsafe_takeover) goto out; if (!nbcon_context_can_proceed(ctxt, &cur)) return false; new.atom = cur.atom; new.unsafe = unsafe; } while (!nbcon_state_try_cmpxchg(con, &cur, &new)); cur.atom = new.atom; out: return nbcon_context_can_proceed(ctxt, &cur); } static void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, char *buf, unsigned int len) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; struct nbcon_state cur; wctxt->outbuf = buf; wctxt->len = len; nbcon_state_read(con, &cur); wctxt->unsafe_takeover = cur.unsafe_takeover; } /** * nbcon_enter_unsafe - Enter an unsafe region in the driver * @wctxt: The write context that was handed to the write function * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. */ bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); bool is_owner; is_owner = nbcon_context_enter_unsafe(ctxt); if (!is_owner) nbcon_write_context_set_buf(wctxt, NULL, 0); return is_owner; } EXPORT_SYMBOL_GPL(nbcon_enter_unsafe); /** * nbcon_exit_unsafe - Exit an unsafe region in the driver * @wctxt: The write context that was handed to the write function * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. */ bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); bool ret; ret = nbcon_context_exit_unsafe(ctxt); if (!ret) nbcon_write_context_set_buf(wctxt, NULL, 0); return ret; } EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); /** * nbcon_reacquire_nobuf - Reacquire a console after losing ownership * while printing * @wctxt: The write context that was handed to the write callback * * Since ownership can be lost at any time due to handover or takeover, a * printing context _must_ be prepared to back out immediately and * carefully. However, there are scenarios where the printing context must * reacquire ownership in order to finalize or revert hardware changes. * * This function allows a printing context to reacquire ownership using the * same priority as its previous ownership. * * Note that after a successful reacquire the printing context will have no * output buffer because that has been lost. This function cannot be used to * resume printing. */ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); while (!nbcon_context_try_acquire(ctxt)) cpu_relax(); nbcon_write_context_set_buf(wctxt, NULL, 0); } EXPORT_SYMBOL_GPL(nbcon_reacquire_nobuf); /** * nbcon_emit_next_record - Emit a record in the acquired context * @wctxt: The write context that will be handed to the write function * @use_atomic: True if the write_atomic() callback is to be used * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. If the caller * wants to do more it must reacquire the console first. * * When true is returned, @wctxt->ctxt.backlog indicates whether there are * still records pending in the ringbuffer, */ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_atomic) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED; struct printk_message pmsg = { .pbufs = ctxt->pbufs, }; unsigned long con_dropped; struct nbcon_state cur; unsigned long dropped; unsigned long ulseq; /* * This function should never be called for consoles that have not * implemented the necessary callback for writing: i.e. legacy * consoles and, when atomic, nbcon consoles with no write_atomic(). * Handle it as if ownership was lost and try to continue. * * Note that for nbcon consoles the write_thread() callback is * mandatory and was already checked in nbcon_alloc(). */ if (WARN_ON_ONCE((use_atomic && !con->write_atomic) || !(console_srcu_read_flags(con) & CON_NBCON))) { nbcon_context_release(ctxt); return false; } /* * The printk buffers are filled within an unsafe section. This * prevents NBCON_PRIO_NORMAL and NBCON_PRIO_EMERGENCY from * clobbering each other. */ if (!nbcon_context_enter_unsafe(ctxt)) return false; ctxt->backlog = printk_get_next_message(&pmsg, ctxt->seq, is_extended, true); if (!ctxt->backlog) return nbcon_context_exit_unsafe(ctxt); /* * @con->dropped is not protected in case of an unsafe hostile * takeover. In that situation the update can be racy so * annotate it accordingly. */ con_dropped = data_race(READ_ONCE(con->dropped)); dropped = con_dropped + pmsg.dropped; if (dropped && !is_extended) console_prepend_dropped(&pmsg, dropped); /* * If the previous owner was assigned the same record, this context * has taken over ownership and is replaying the record. Prepend a * message to let the user know the record is replayed. */ ulseq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_prev_seq)); if (__ulseq_to_u64seq(prb, ulseq) == pmsg.seq) { console_prepend_replay(&pmsg); } else { /* * Ensure this context is still the owner before trying to * update @nbcon_prev_seq. Otherwise the value in @ulseq may * not be from the previous owner and instead be some later * value from the context that took over ownership. */ nbcon_state_read(con, &cur); if (!nbcon_context_can_proceed(ctxt, &cur)) return false; atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_prev_seq), &ulseq, __u64seq_to_ulseq(pmsg.seq)); } if (!nbcon_context_exit_unsafe(ctxt)) return false; /* For skipped records just update seq/dropped in @con. */ if (pmsg.outbuf_len == 0) goto update_con; /* Initialize the write context for driver callbacks. */ nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len); if (use_atomic) con->write_atomic(con, wctxt); else con->write_thread(con, wctxt); if (!wctxt->outbuf) { /* * Ownership was lost and reacquired by the driver. Handle it * as if ownership was lost. */ nbcon_context_release(ctxt); return false; } /* * Ownership may have been lost but _not_ reacquired by the driver. * This case is detected and handled when entering unsafe to update * dropped/seq values. */ /* * Since any dropped message was successfully output, reset the * dropped count for the console. */ dropped = 0; update_con: /* * The dropped count and the sequence number are updated within an * unsafe section. This limits update races to the panic context and * allows the panic context to win. */ if (!nbcon_context_enter_unsafe(ctxt)) return false; if (dropped != con_dropped) { /* Counterpart to the READ_ONCE() above. */ WRITE_ONCE(con->dropped, dropped); } nbcon_seq_try_update(ctxt, pmsg.seq + 1); return nbcon_context_exit_unsafe(ctxt); } /* * nbcon_emit_one - Print one record for an nbcon console using the * specified callback * @wctxt: An initialized write context struct to use for this context * @use_atomic: True if the write_atomic() callback is to be used * * Return: True, when a record has been printed and there are still * pending records. The caller might want to continue flushing. * * False, when there is no pending record, or when the console * context cannot be acquired, or the ownership has been lost. * The caller should give up. Either the job is done, cannot be * done, or will be handled by the owning context. * * This is an internal helper to handle the locking of the console before * calling nbcon_emit_next_record(). */ static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; unsigned long flags; bool ret = false; if (!use_atomic) { con->device_lock(con, &flags); /* * Ensure this stays on the CPU to make handover and * takeover possible. */ cant_migrate(); } if (!nbcon_context_try_acquire(ctxt)) goto out; /* * nbcon_emit_next_record() returns false when the console was * handed over or taken over. In both cases the context is no * longer valid. * * The higher priority printing context takes over responsibility * to print the pending records. */ if (!nbcon_emit_next_record(wctxt, use_atomic)) goto out; nbcon_context_release(ctxt); ret = ctxt->backlog; out: if (!use_atomic) con->device_unlock(con, flags); return ret; } /** * nbcon_kthread_should_wakeup - Check whether a printer thread should wakeup * @con: Console to operate on * @ctxt: The nbcon context from nbcon_context_try_acquire() * * Return: True if the thread should shutdown or if the console is * allowed to print and a record is available. False otherwise. * * After the thread wakes up, it must first check if it should shutdown before * attempting any printing. */ static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_context *ctxt) { bool ret = false; short flags; int cookie; if (kthread_should_stop()) return true; cookie = console_srcu_read_lock(); flags = console_srcu_read_flags(con); if (console_is_usable(con, flags, false)) { /* Bring the sequence in @ctxt up to date */ ctxt->seq = nbcon_seq_read(con); ret = prb_read_valid(prb, ctxt->seq, NULL); } console_srcu_read_unlock(cookie); return ret; } /** * nbcon_kthread_func - The printer thread function * @__console: Console to operate on * * Return: 0 */ static int nbcon_kthread_func(void *__console) { struct console *con = __console; struct nbcon_write_context wctxt = { .ctxt.console = con, .ctxt.prio = NBCON_PRIO_NORMAL, }; struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); short con_flags; bool backlog; int cookie; wait_for_event: /* * Guarantee this task is visible on the rcuwait before * checking the wake condition. * * The full memory barrier within set_current_state() of * ___rcuwait_wait_event() pairs with the full memory * barrier within rcuwait_has_sleeper(). * * This pairs with rcuwait_has_sleeper:A and nbcon_kthread_wake:A. */ rcuwait_wait_event(&con->rcuwait, nbcon_kthread_should_wakeup(con, ctxt), TASK_INTERRUPTIBLE); /* LMM(nbcon_kthread_func:A) */ do { if (kthread_should_stop()) return 0; backlog = false; /* * Keep the srcu read lock around the entire operation so that * synchronize_srcu() can guarantee that the kthread stopped * or suspended printing. */ cookie = console_srcu_read_lock(); con_flags = console_srcu_read_flags(con); if (console_is_usable(con, con_flags, false)) backlog = nbcon_emit_one(&wctxt, false); console_srcu_read_unlock(cookie); cond_resched(); } while (backlog); goto wait_for_event; } /** * nbcon_irq_work - irq work to wake console printer thread * @irq_work: The irq work to operate on */ static void nbcon_irq_work(struct irq_work *irq_work) { struct console *con = container_of(irq_work, struct console, irq_work); nbcon_kthread_wake(con); } static inline bool rcuwait_has_sleeper(struct rcuwait *w) { /* * Guarantee any new records can be seen by tasks preparing to wait * before this context checks if the rcuwait is empty. * * This full memory barrier pairs with the full memory barrier within * set_current_state() of ___rcuwait_wait_event(), which is called * after prepare_to_rcuwait() adds the waiter but before it has * checked the wait condition. * * This pairs with nbcon_kthread_func:A. */ smp_mb(); /* LMM(rcuwait_has_sleeper:A) */ return rcuwait_active(w); } /** * nbcon_kthreads_wake - Wake up printing threads using irq_work */ void nbcon_kthreads_wake(void) { struct console *con; int cookie; if (!printk_kthreads_running) return; cookie = console_srcu_read_lock(); for_each_console_srcu(con) { if (!(console_srcu_read_flags(con) & CON_NBCON)) continue; /* * Only schedule irq_work if the printing thread is * actively waiting. If not waiting, the thread will * notice by itself that it has work to do. */ if (rcuwait_has_sleeper(&con->rcuwait)) irq_work_queue(&con->irq_work); } console_srcu_read_unlock(cookie); } /* * nbcon_kthread_stop - Stop a console printer thread * @con: Console to operate on */ void nbcon_kthread_stop(struct console *con) { lockdep_assert_console_list_lock_held(); if (!con->kthread) return; kthread_stop(con->kthread); con->kthread = NULL; } /** * nbcon_kthread_create - Create a console printer thread * @con: Console to operate on * * Return: True if the kthread was started or already exists. * Otherwise false and @con must not be registered. * * This function is called when it will be expected that nbcon consoles are * flushed using the kthread. The messages printed with NBCON_PRIO_NORMAL * will be no longer flushed by the legacy loop. This is why failure must * be fatal for console registration. * * If @con was already registered and this function fails, @con must be * unregistered before the global state variable @printk_kthreads_running * can be set. */ bool nbcon_kthread_create(struct console *con) { struct task_struct *kt; lockdep_assert_console_list_lock_held(); if (con->kthread) return true; kt = kthread_run(nbcon_kthread_func, con, "pr/%s%d", con->name, con->index); if (WARN_ON(IS_ERR(kt))) { con_printk(KERN_ERR, con, "failed to start printing thread\n"); return false; } con->kthread = kt; /* * It is important that console printing threads are scheduled * shortly after a printk call and with generous runtime budgets. */ sched_set_normal(con->kthread, -20); return true; } /* Track the nbcon emergency nesting per CPU. */ static DEFINE_PER_CPU(unsigned int, nbcon_pcpu_emergency_nesting); static unsigned int early_nbcon_pcpu_emergency_nesting __initdata; /** * nbcon_get_cpu_emergency_nesting - Get the per CPU emergency nesting pointer * * Context: For reading, any context. For writing, any context which could * not be migrated to another CPU. * Return: Either a pointer to the per CPU emergency nesting counter of * the current CPU or to the init data during early boot. * * The function is safe for reading per-CPU variables in any context because * preemption is disabled if the current CPU is in the emergency state. See * also nbcon_cpu_emergency_enter(). */ static __ref unsigned int *nbcon_get_cpu_emergency_nesting(void) { /* * The value of __printk_percpu_data_ready gets set in normal * context and before SMP initialization. As a result it could * never change while inside an nbcon emergency section. */ if (!printk_percpu_data_ready()) return &early_nbcon_pcpu_emergency_nesting; return raw_cpu_ptr(&nbcon_pcpu_emergency_nesting); } /** * nbcon_get_default_prio - The appropriate nbcon priority to use for nbcon * printing on the current CPU * * Context: Any context. * Return: The nbcon_prio to use for acquiring an nbcon console in this * context for printing. * * The function is safe for reading per-CPU data in any context because * preemption is disabled if the current CPU is in the emergency or panic * state. */ enum nbcon_prio nbcon_get_default_prio(void) { unsigned int *cpu_emergency_nesting; if (this_cpu_in_panic()) return NBCON_PRIO_PANIC; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); if (*cpu_emergency_nesting) return NBCON_PRIO_EMERGENCY; return NBCON_PRIO_NORMAL; } /** * nbcon_legacy_emit_next_record - Print one record for an nbcon console * in legacy contexts * @con: The console to print on * @handover: Will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding * both the console_lock and the SRCU read lock. Otherwise it * is set to false. * @cookie: The cookie from the SRCU read lock. * @use_atomic: Set true when called in an atomic or unknown context. * It affects which nbcon callback will be used: write_atomic() * or write_thread(). * * When false, the write_thread() callback is used and would be * called in a preemtible context unless disabled by the * device_lock. The legacy handover is not allowed in this mode. * * Context: Any context except NMI. * Return: True, when a record has been printed and there are still * pending records. The caller might want to continue flushing. * * False, when there is no pending record, or when the console * context cannot be acquired, or the ownership has been lost. * The caller should give up. Either the job is done, cannot be * done, or will be handled by the owning context. * * This function is meant to be called by console_flush_all() to print records * on nbcon consoles from legacy context (printing via console unlocking). * Essentially it is the nbcon version of console_emit_next_record(). */ bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, int cookie, bool use_atomic) { struct nbcon_write_context wctxt = { }; struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); unsigned long flags; bool progress; ctxt->console = con; ctxt->prio = nbcon_get_default_prio(); if (use_atomic) { /* * In an atomic or unknown context, use the same procedure as * in console_emit_next_record(). It allows to handover. */ printk_safe_enter_irqsave(flags); console_lock_spinning_enable(); stop_critical_timings(); } progress = nbcon_emit_one(&wctxt, use_atomic); if (use_atomic) { start_critical_timings(); *handover = console_lock_spinning_disable_and_check(cookie); printk_safe_exit_irqrestore(flags); } else { /* Non-atomic does not perform legacy spinning handovers. */ *handover = false; } return progress; } /** * __nbcon_atomic_flush_pending_con - Flush specified nbcon console using its * write_atomic() callback * @con: The nbcon console to flush * @stop_seq: Flush up until this record * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers * * Return: 0 if @con was flushed up to @stop_seq Otherwise, error code on * failure. * * Errors: * * -EPERM: Unable to acquire console ownership. * * -EAGAIN: Another context took over ownership while printing. * * -ENOENT: A record before @stop_seq is not available. * * If flushing up to @stop_seq was not successful, it only makes sense for the * caller to try again when -EAGAIN was returned. When -EPERM is returned, * this context is not allowed to acquire the console. When -ENOENT is * returned, it cannot be expected that the unfinalized record will become * available. */ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, bool allow_unsafe_takeover) { struct nbcon_write_context wctxt = { }; struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); int err = 0; ctxt->console = con; ctxt->spinwait_max_us = 2000; ctxt->prio = nbcon_get_default_prio(); ctxt->allow_unsafe_takeover = allow_unsafe_takeover; if (!nbcon_context_try_acquire(ctxt)) return -EPERM; while (nbcon_seq_read(con) < stop_seq) { /* * nbcon_emit_next_record() returns false when the console was * handed over or taken over. In both cases the context is no * longer valid. */ if (!nbcon_emit_next_record(&wctxt, true)) return -EAGAIN; if (!ctxt->backlog) { /* Are there reserved but not yet finalized records? */ if (nbcon_seq_read(con) < stop_seq) err = -ENOENT; break; } } nbcon_context_release(ctxt); return err; } /** * nbcon_atomic_flush_pending_con - Flush specified nbcon console using its * write_atomic() callback * @con: The nbcon console to flush * @stop_seq: Flush up until this record * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers * * This will stop flushing before @stop_seq if another context has ownership. * That context is then responsible for the flushing. Likewise, if new records * are added while this context was flushing and there is no other context * to handle the printing, this context must also flush those records. */ static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, bool allow_unsafe_takeover) { struct console_flush_type ft; unsigned long flags; int err; again: /* * Atomic flushing does not use console driver synchronization (i.e. * it does not hold the port lock for uart consoles). Therefore IRQs * must be disabled to avoid being interrupted and then calling into * a driver that will deadlock trying to acquire console ownership. */ local_irq_save(flags); err = __nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); local_irq_restore(flags); /* * If there was a new owner (-EPERM, -EAGAIN), that context is * responsible for completing. * * Do not wait for records not yet finalized (-ENOENT) to avoid a * possible deadlock. They will either get flushed by the writer or * eventually skipped on panic CPU. */ if (err) return; /* * If flushing was successful but more records are available, this * context must flush those remaining records if the printer thread * is not available do it. */ printk_get_console_flush_type(&ft); if (!ft.nbcon_offload && prb_read_valid(prb, nbcon_seq_read(con), NULL)) { stop_seq = prb_next_reserve_seq(prb); goto again; } } /** * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their * write_atomic() callback * @stop_seq: Flush up until this record * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers */ static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeover) { struct console *con; int cookie; cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); if (!(flags & CON_NBCON)) continue; if (!console_is_usable(con, flags, true)) continue; if (nbcon_seq_read(con) >= stop_seq) continue; nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); } console_srcu_read_unlock(cookie); } /** * nbcon_atomic_flush_pending - Flush all nbcon consoles using their * write_atomic() callback * * Flush the backlog up through the currently newest record. Any new * records added while flushing will not be flushed if there is another * context available to handle the flushing. This is to avoid one CPU * printing unbounded because other CPUs continue to add records. */ void nbcon_atomic_flush_pending(void) { __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), false); } /** * nbcon_atomic_flush_unsafe - Flush all nbcon consoles using their * write_atomic() callback and allowing unsafe hostile takeovers * * Flush the backlog up through the currently newest record. Unsafe hostile * takeovers will be performed, if necessary. */ void nbcon_atomic_flush_unsafe(void) { __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), true); } /** * nbcon_cpu_emergency_enter - Enter an emergency section where printk() * messages for that CPU are flushed directly * * Context: Any context. Disables preemption. * * When within an emergency section, printk() calls will attempt to flush any * pending messages in the ringbuffer. */ void nbcon_cpu_emergency_enter(void) { unsigned int *cpu_emergency_nesting; preempt_disable(); cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); (*cpu_emergency_nesting)++; } /** * nbcon_cpu_emergency_exit - Exit an emergency section * * Context: Within an emergency section. Enables preemption. */ void nbcon_cpu_emergency_exit(void) { unsigned int *cpu_emergency_nesting; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0)) (*cpu_emergency_nesting)--; preempt_enable(); } /** * nbcon_alloc - Allocate and init the nbcon console specific data * @con: Console to initialize * * Return: True if the console was fully allocated and initialized. * Otherwise @con must not be registered. * * When allocation and init was successful, the console must be properly * freed using nbcon_free() once it is no longer needed. */ bool nbcon_alloc(struct console *con) { struct nbcon_state state = { }; /* The write_thread() callback is mandatory. */ if (WARN_ON(!con->write_thread)) return false; rcuwait_init(&con->rcuwait); init_irq_work(&con->irq_work, nbcon_irq_work); atomic_long_set(&ACCESS_PRIVATE(con, nbcon_prev_seq), -1UL); nbcon_state_set(con, &state); /* * Initialize @nbcon_seq to the highest possible sequence number so * that practically speaking it will have nothing to print until a * desired initial sequence number has been set via nbcon_seq_force(). */ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), ULSEQ_MAX(prb)); if (con->flags & CON_BOOT) { /* * Boot console printing is synchronized with legacy console * printing, so boot consoles can share the same global printk * buffers. */ con->pbufs = &printk_shared_pbufs; } else { con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL); if (!con->pbufs) { con_printk(KERN_ERR, con, "failed to allocate printing buffer\n"); return false; } if (printk_kthreads_running) { if (!nbcon_kthread_create(con)) { kfree(con->pbufs); con->pbufs = NULL; return false; } } } return true; } /** * nbcon_free - Free and cleanup the nbcon console specific data * @con: Console to free/cleanup nbcon data */ void nbcon_free(struct console *con) { struct nbcon_state state = { }; if (printk_kthreads_running) nbcon_kthread_stop(con); nbcon_state_set(con, &state); /* Boot consoles share global printk buffers. */ if (!(con->flags & CON_BOOT)) kfree(con->pbufs); con->pbufs = NULL; } /** * nbcon_device_try_acquire - Try to acquire nbcon console and enter unsafe * section * @con: The nbcon console to acquire * * Context: Under the locking mechanism implemented in * @con->device_lock() including disabling migration. * Return: True if the console was acquired. False otherwise. * * Console drivers will usually use their own internal synchronization * mechasism to synchronize between console printing and non-printing * activities (such as setting baud rates). However, nbcon console drivers * supporting atomic consoles may also want to mark unsafe sections when * performing non-printing activities in order to synchronize against their * atomic_write() callback. * * This function acquires the nbcon console using priority NBCON_PRIO_NORMAL * and marks it unsafe for handover/takeover. */ bool nbcon_device_try_acquire(struct console *con) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt); cant_migrate(); memset(ctxt, 0, sizeof(*ctxt)); ctxt->console = con; ctxt->prio = NBCON_PRIO_NORMAL; if (!nbcon_context_try_acquire(ctxt)) return false; if (!nbcon_context_enter_unsafe(ctxt)) return false; return true; } EXPORT_SYMBOL_GPL(nbcon_device_try_acquire); /** * nbcon_device_release - Exit unsafe section and release the nbcon console * @con: The nbcon console acquired in nbcon_device_try_acquire() */ void nbcon_device_release(struct console *con) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt); struct console_flush_type ft; int cookie; if (!nbcon_context_exit_unsafe(ctxt)) return; nbcon_context_release(ctxt); /* * This context must flush any new records added while the console * was locked if the printer thread is not available to do it. The * console_srcu_read_lock must be taken to ensure the console is * usable throughout flushing. */ cookie = console_srcu_read_lock(); printk_get_console_flush_type(&ft); if (console_is_usable(con, console_srcu_read_flags(con), true) && !ft.nbcon_offload && prb_read_valid(prb, nbcon_seq_read(con), NULL)) { /* * If nbcon_atomic flushing is not available, fallback to * using the legacy loop. */ if (ft.nbcon_atomic) { __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb), false); } else if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } else if (ft.legacy_offload) { printk_trigger_flush(); } } console_srcu_read_unlock(cookie); } EXPORT_SYMBOL_GPL(nbcon_device_release);
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" struct stats_req_info { struct ethnl_req_info base; DECLARE_BITMAP(stat_mask, __ETHTOOL_STATS_CNT); enum ethtool_mac_stats_src src; }; #define STATS_REQINFO(__req_base) \ container_of(__req_base, struct stats_req_info, base) struct stats_reply_data { struct ethnl_reply_data base; struct_group(stats, struct ethtool_eth_phy_stats phy_stats; struct ethtool_eth_mac_stats mac_stats; struct ethtool_eth_ctrl_stats ctrl_stats; struct ethtool_rmon_stats rmon_stats; ); const struct ethtool_rmon_hist_range *rmon_ranges; }; #define STATS_REPDATA(__reply_base) \ container_of(__reply_base, struct stats_reply_data, base) const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_STATS_ETH_PHY] = "eth-phy", [ETHTOOL_STATS_ETH_MAC] = "eth-mac", [ETHTOOL_STATS_ETH_CTRL] = "eth-ctrl", [ETHTOOL_STATS_RMON] = "rmon", }; const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR] = "SymbolErrorDuringCarrier", }; const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT] = "FramesTransmittedOK", [ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL] = "SingleCollisionFrames", [ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL] = "MultipleCollisionFrames", [ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT] = "FramesReceivedOK", [ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR] = "FrameCheckSequenceErrors", [ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR] = "AlignmentErrors", [ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES] = "OctetsTransmittedOK", [ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER] = "FramesWithDeferredXmissions", [ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL] = "LateCollisions", [ETHTOOL_A_STATS_ETH_MAC_11_XS_COL] = "FramesAbortedDueToXSColls", [ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR] = "FramesLostDueToIntMACXmitError", [ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR] = "CarrierSenseErrors", [ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES] = "OctetsReceivedOK", [ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR] = "FramesLostDueToIntMACRcvError", [ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST] = "MulticastFramesXmittedOK", [ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST] = "BroadcastFramesXmittedOK", [ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER] = "FramesWithExcessiveDeferral", [ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST] = "MulticastFramesReceivedOK", [ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST] = "BroadcastFramesReceivedOK", [ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR] = "InRangeLengthErrors", [ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN] = "OutOfRangeLengthField", [ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR] = "FrameTooLongErrors", }; const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_A_STATS_ETH_CTRL_3_TX] = "MACControlFramesTransmitted", [ETHTOOL_A_STATS_ETH_CTRL_4_RX] = "MACControlFramesReceived", [ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP] = "UnsupportedOpcodesReceived", }; const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_A_STATS_RMON_UNDERSIZE] = "etherStatsUndersizePkts", [ETHTOOL_A_STATS_RMON_OVERSIZE] = "etherStatsOversizePkts", [ETHTOOL_A_STATS_RMON_FRAG] = "etherStatsFragments", [ETHTOOL_A_STATS_RMON_JABBER] = "etherStatsJabbers", }; const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_SRC + 1] = { [ETHTOOL_A_STATS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_STATS_GROUPS] = { .type = NLA_NESTED }, [ETHTOOL_A_STATS_SRC] = NLA_POLICY_MAX(NLA_U32, ETHTOOL_MAC_STATS_SRC_PMAC), }; static int stats_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, struct netlink_ext_ack *extack) { enum ethtool_mac_stats_src src = ETHTOOL_MAC_STATS_SRC_AGGREGATE; struct stats_req_info *req_info = STATS_REQINFO(req_base); bool mod = false; int err; err = ethnl_update_bitset(req_info->stat_mask, __ETHTOOL_STATS_CNT, tb[ETHTOOL_A_STATS_GROUPS], stats_std_names, extack, &mod); if (err) return err; if (!mod) { NL_SET_ERR_MSG(extack, "no stats requested"); return -EINVAL; } if (tb[ETHTOOL_A_STATS_SRC]) src = nla_get_u32(tb[ETHTOOL_A_STATS_SRC]); req_info->src = src; return 0; } static int stats_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { const struct stats_req_info *req_info = STATS_REQINFO(req_base); struct stats_reply_data *data = STATS_REPDATA(reply_base); enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; if ((src == ETHTOOL_MAC_STATS_SRC_EMAC || src == ETHTOOL_MAC_STATS_SRC_PMAC) && !__ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG_MOD(info->extack, "Device does not support MAC merge layer"); ethnl_ops_complete(dev); return -EOPNOTSUPP; } /* Mark all stats as unset (see ETHTOOL_STAT_NOT_SET) to prevent them * from being reported to user space in case driver did not set them. */ memset(&data->stats, 0xff, sizeof(data->stats)); data->phy_stats.src = src; data->mac_stats.src = src; data->ctrl_stats.src = src; data->rmon_stats.src = src; if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) && dev->ethtool_ops->get_eth_phy_stats) dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats); if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask) && dev->ethtool_ops->get_eth_mac_stats) dev->ethtool_ops->get_eth_mac_stats(dev, &data->mac_stats); if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask) && dev->ethtool_ops->get_eth_ctrl_stats) dev->ethtool_ops->get_eth_ctrl_stats(dev, &data->ctrl_stats); if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask) && dev->ethtool_ops->get_rmon_stats) dev->ethtool_ops->get_rmon_stats(dev, &data->rmon_stats, &data->rmon_ranges); ethnl_ops_complete(dev); return 0; } static int stats_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct stats_req_info *req_info = STATS_REQINFO(req_base); unsigned int n_grps = 0, n_stats = 0; int len = 0; len += nla_total_size(sizeof(u32)); /* _STATS_SRC */ if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) { n_stats += sizeof(struct ethtool_eth_phy_stats) / sizeof(u64); n_grps++; } if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask)) { n_stats += sizeof(struct ethtool_eth_mac_stats) / sizeof(u64); n_grps++; } if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask)) { n_stats += sizeof(struct ethtool_eth_ctrl_stats) / sizeof(u64); n_grps++; } if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask)) { n_stats += sizeof(struct ethtool_rmon_stats) / sizeof(u64); n_grps++; /* Above includes the space for _A_STATS_GRP_HIST_VALs */ len += (nla_total_size(0) + /* _A_STATS_GRP_HIST */ nla_total_size(4) + /* _A_STATS_GRP_HIST_BKT_LOW */ nla_total_size(4)) * /* _A_STATS_GRP_HIST_BKT_HI */ ETHTOOL_RMON_HIST_MAX * 2; } len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */ nla_total_size(4) + /* _A_STATS_GRP_ID */ nla_total_size(4)); /* _A_STATS_GRP_SS_ID */ len += n_stats * (nla_total_size(0) + /* _A_STATS_GRP_STAT */ nla_total_size_64bit(sizeof(u64))); return len; } static int stat_put(struct sk_buff *skb, u16 attrtype, u64 val) { struct nlattr *nest; int ret; if (val == ETHTOOL_STAT_NOT_SET) return 0; /* We want to start stats attr types from 0, so we don't have a type * for pad inside ETHTOOL_A_STATS_GRP_STAT. Pad things on the outside * of ETHTOOL_A_STATS_GRP_STAT. Since we're one nest away from the * actual attr we're 4B off - nla_need_padding_for_64bit() & co. * can't be used. */ #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS if (!IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8)) if (!nla_reserve(skb, ETHTOOL_A_STATS_GRP_PAD, 0)) return -EMSGSIZE; #endif nest = nla_nest_start(skb, ETHTOOL_A_STATS_GRP_STAT); if (!nest) return -EMSGSIZE; ret = nla_put_u64_64bit(skb, attrtype, val, -1 /* not used */); if (ret) { nla_nest_cancel(skb, nest); return ret; } nla_nest_end(skb, nest); return 0; } static int stats_put_phy_stats(struct sk_buff *skb, const struct stats_reply_data *data) { if (stat_put(skb, ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR, data->phy_stats.SymbolErrorDuringCarrier)) return -EMSGSIZE; return 0; } static int stats_put_mac_stats(struct sk_buff *skb, const struct stats_reply_data *data) { if (stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT, data->mac_stats.FramesTransmittedOK) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL, data->mac_stats.SingleCollisionFrames) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL, data->mac_stats.MultipleCollisionFrames) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT, data->mac_stats.FramesReceivedOK) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR, data->mac_stats.FrameCheckSequenceErrors) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR, data->mac_stats.AlignmentErrors) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES, data->mac_stats.OctetsTransmittedOK) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER, data->mac_stats.FramesWithDeferredXmissions) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL, data->mac_stats.LateCollisions) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_11_XS_COL, data->mac_stats.FramesAbortedDueToXSColls) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR, data->mac_stats.FramesLostDueToIntMACXmitError) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR, data->mac_stats.CarrierSenseErrors) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES, data->mac_stats.OctetsReceivedOK) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR, data->mac_stats.FramesLostDueToIntMACRcvError) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST, data->mac_stats.MulticastFramesXmittedOK) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST, data->mac_stats.BroadcastFramesXmittedOK) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER, data->mac_stats.FramesWithExcessiveDeferral) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST, data->mac_stats.MulticastFramesReceivedOK) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST, data->mac_stats.BroadcastFramesReceivedOK) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR, data->mac_stats.InRangeLengthErrors) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN, data->mac_stats.OutOfRangeLengthField) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR, data->mac_stats.FrameTooLongErrors)) return -EMSGSIZE; return 0; } static int stats_put_ctrl_stats(struct sk_buff *skb, const struct stats_reply_data *data) { if (stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_3_TX, data->ctrl_stats.MACControlFramesTransmitted) || stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_4_RX, data->ctrl_stats.MACControlFramesReceived) || stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP, data->ctrl_stats.UnsupportedOpcodesReceived)) return -EMSGSIZE; return 0; } static int stats_put_rmon_hist(struct sk_buff *skb, u32 attr, const u64 *hist, const struct ethtool_rmon_hist_range *ranges) { struct nlattr *nest; int i; if (!ranges) return 0; for (i = 0; i < ETHTOOL_RMON_HIST_MAX; i++) { if (!ranges[i].low && !ranges[i].high) break; if (hist[i] == ETHTOOL_STAT_NOT_SET) continue; nest = nla_nest_start(skb, attr); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_LOW, ranges[i].low) || nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_HI, ranges[i].high) || nla_put_u64_64bit(skb, ETHTOOL_A_STATS_GRP_HIST_VAL, hist[i], ETHTOOL_A_STATS_GRP_PAD)) goto err_cancel_hist; nla_nest_end(skb, nest); } return 0; err_cancel_hist: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int stats_put_rmon_stats(struct sk_buff *skb, const struct stats_reply_data *data) { if (stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_RX, data->rmon_stats.hist, data->rmon_ranges) || stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_TX, data->rmon_stats.hist_tx, data->rmon_ranges)) return -EMSGSIZE; if (stat_put(skb, ETHTOOL_A_STATS_RMON_UNDERSIZE, data->rmon_stats.undersize_pkts) || stat_put(skb, ETHTOOL_A_STATS_RMON_OVERSIZE, data->rmon_stats.oversize_pkts) || stat_put(skb, ETHTOOL_A_STATS_RMON_FRAG, data->rmon_stats.fragments) || stat_put(skb, ETHTOOL_A_STATS_RMON_JABBER, data->rmon_stats.jabbers)) return -EMSGSIZE; return 0; } static int stats_put_stats(struct sk_buff *skb, const struct stats_reply_data *data, u32 id, u32 ss_id, int (*cb)(struct sk_buff *skb, const struct stats_reply_data *data)) { struct nlattr *nest; nest = nla_nest_start(skb, ETHTOOL_A_STATS_GRP); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_ID, id) || nla_put_u32(skb, ETHTOOL_A_STATS_GRP_SS_ID, ss_id)) goto err_cancel; if (cb(skb, data)) goto err_cancel; nla_nest_end(skb, nest); return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int stats_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct stats_req_info *req_info = STATS_REQINFO(req_base); const struct stats_reply_data *data = STATS_REPDATA(reply_base); int ret = 0; if (nla_put_u32(skb, ETHTOOL_A_STATS_SRC, req_info->src)) return -EMSGSIZE; if (!ret && test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_PHY, ETH_SS_STATS_ETH_PHY, stats_put_phy_stats); if (!ret && test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask)) ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_MAC, ETH_SS_STATS_ETH_MAC, stats_put_mac_stats); if (!ret && test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask)) ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_CTRL, ETH_SS_STATS_ETH_CTRL, stats_put_ctrl_stats); if (!ret && test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask)) ret = stats_put_stats(skb, data, ETHTOOL_STATS_RMON, ETH_SS_STATS_RMON, stats_put_rmon_stats); return ret; } const struct ethnl_request_ops ethnl_stats_request_ops = { .request_cmd = ETHTOOL_MSG_STATS_GET, .reply_cmd = ETHTOOL_MSG_STATS_GET_REPLY, .hdr_attr = ETHTOOL_A_STATS_HEADER, .req_info_size = sizeof(struct stats_req_info), .reply_data_size = sizeof(struct stats_reply_data), .parse_request = stats_parse_request, .prepare_data = stats_prepare_data, .reply_size = stats_reply_size, .fill_reply = stats_fill_reply, }; static u64 ethtool_stats_sum(u64 a, u64 b) { if (a == ETHTOOL_STAT_NOT_SET) return b; if (b == ETHTOOL_STAT_NOT_SET) return a; return a + b; } /* Avoid modifying the aggregation procedure every time a new counter is added * by treating the structures as an array of u64 statistics. */ static void ethtool_aggregate_stats(void *aggr_stats, const void *emac_stats, const void *pmac_stats, size_t stats_size, size_t stats_offset) { size_t num_stats = stats_size / sizeof(u64); const u64 *s1 = emac_stats + stats_offset; const u64 *s2 = pmac_stats + stats_offset; u64 *s = aggr_stats + stats_offset; int i; for (i = 0; i < num_stats; i++) s[i] = ethtool_stats_sum(s1[i], s2[i]); } void ethtool_aggregate_mac_stats(struct net_device *dev, struct ethtool_eth_mac_stats *mac_stats) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_eth_mac_stats pmac, emac; memset(&emac, 0xff, sizeof(emac)); memset(&pmac, 0xff, sizeof(pmac)); emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; ops->get_eth_mac_stats(dev, &emac); ops->get_eth_mac_stats(dev, &pmac); ethtool_aggregate_stats(mac_stats, &emac, &pmac, sizeof(mac_stats->stats), offsetof(struct ethtool_eth_mac_stats, stats)); } EXPORT_SYMBOL(ethtool_aggregate_mac_stats); void ethtool_aggregate_phy_stats(struct net_device *dev, struct ethtool_eth_phy_stats *phy_stats) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_eth_phy_stats pmac, emac; memset(&emac, 0xff, sizeof(emac)); memset(&pmac, 0xff, sizeof(pmac)); emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; ops->get_eth_phy_stats(dev, &emac); ops->get_eth_phy_stats(dev, &pmac); ethtool_aggregate_stats(phy_stats, &emac, &pmac, sizeof(phy_stats->stats), offsetof(struct ethtool_eth_phy_stats, stats)); } EXPORT_SYMBOL(ethtool_aggregate_phy_stats); void ethtool_aggregate_ctrl_stats(struct net_device *dev, struct ethtool_eth_ctrl_stats *ctrl_stats) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_eth_ctrl_stats pmac, emac; memset(&emac, 0xff, sizeof(emac)); memset(&pmac, 0xff, sizeof(pmac)); emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; ops->get_eth_ctrl_stats(dev, &emac); ops->get_eth_ctrl_stats(dev, &pmac); ethtool_aggregate_stats(ctrl_stats, &emac, &pmac, sizeof(ctrl_stats->stats), offsetof(struct ethtool_eth_ctrl_stats, stats)); } EXPORT_SYMBOL(ethtool_aggregate_ctrl_stats); void ethtool_aggregate_pause_stats(struct net_device *dev, struct ethtool_pause_stats *pause_stats) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_pause_stats pmac, emac; memset(&emac, 0xff, sizeof(emac)); memset(&pmac, 0xff, sizeof(pmac)); emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; ops->get_pause_stats(dev, &emac); ops->get_pause_stats(dev, &pmac); ethtool_aggregate_stats(pause_stats, &emac, &pmac, sizeof(pause_stats->stats), offsetof(struct ethtool_pause_stats, stats)); } EXPORT_SYMBOL(ethtool_aggregate_pause_stats); void ethtool_aggregate_rmon_stats(struct net_device *dev, struct ethtool_rmon_stats *rmon_stats) { const struct ethtool_ops *ops = dev->ethtool_ops; const struct ethtool_rmon_hist_range *dummy; struct ethtool_rmon_stats pmac, emac; memset(&emac, 0xff, sizeof(emac)); memset(&pmac, 0xff, sizeof(pmac)); emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; ops->get_rmon_stats(dev, &emac, &dummy); ops->get_rmon_stats(dev, &pmac, &dummy); ethtool_aggregate_stats(rmon_stats, &emac, &pmac, sizeof(rmon_stats->stats), offsetof(struct ethtool_rmon_stats, stats)); } EXPORT_SYMBOL(ethtool_aggregate_rmon_stats);
218 214 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_COOKIE_H #define __LINUX_COOKIE_H #include <linux/atomic.h> #include <linux/percpu.h> #include <asm/local.h> struct pcpu_gen_cookie { local_t nesting; u64 last; } __aligned(16); struct gen_cookie { struct pcpu_gen_cookie __percpu *local; atomic64_t forward_last ____cacheline_aligned_in_smp; atomic64_t reverse_last; }; #define COOKIE_LOCAL_BATCH 4096 #define DEFINE_COOKIE(name) \ static DEFINE_PER_CPU(struct pcpu_gen_cookie, __##name); \ static struct gen_cookie name = { \ .local = &__##name, \ .forward_last = ATOMIC64_INIT(0), \ .reverse_last = ATOMIC64_INIT(0), \ } static __always_inline u64 gen_cookie_next(struct gen_cookie *gc) { struct pcpu_gen_cookie *local = this_cpu_ptr(gc->local); u64 val; if (likely(local_inc_return(&local->nesting) == 1)) { val = local->last; if (__is_defined(CONFIG_SMP) && unlikely((val & (COOKIE_LOCAL_BATCH - 1)) == 0)) { s64 next = atomic64_add_return(COOKIE_LOCAL_BATCH, &gc->forward_last); val = next - COOKIE_LOCAL_BATCH; } local->last = ++val; } else { val = atomic64_dec_return(&gc->reverse_last); } local_dec(&local->nesting); return val; } #endif /* __LINUX_COOKIE_H */
550 549 549 424 549 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/symlink.c - kernfs symlink implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/fs.h> #include <linux/gfp.h> #include <linux/namei.h> #include "kernfs-internal.h" /** * kernfs_create_link - create a symlink * @parent: directory to create the symlink in * @name: name of the symlink * @target: target node for the symlink to point to * * Return: the created node on success, ERR_PTR() value on error. * Ownership of the link matches ownership of the target. */ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target) { struct kernfs_node *kn; int error; kuid_t uid = GLOBAL_ROOT_UID; kgid_t gid = GLOBAL_ROOT_GID; if (target->iattr) { uid = target->iattr->ia_uid; gid = target->iattr->ia_gid; } kn = kernfs_new_node(parent, name, S_IFLNK|0777, uid, gid, KERNFS_LINK); if (!kn) return ERR_PTR(-ENOMEM); if (kernfs_ns_enabled(parent)) kn->ns = target->ns; kn->symlink.target_kn = target; kernfs_get(target); /* ref owned by symlink */ error = kernfs_add_one(kn); if (!error) return kn; kernfs_put(kn); return ERR_PTR(error); } static int kernfs_get_target_path(struct kernfs_node *parent, struct kernfs_node *target, char *path) { struct kernfs_node *base, *kn; char *s = path; int len = 0; /* go up to the root, stop at the base */ base = parent; while (base->parent) { kn = target->parent; while (kn->parent && base != kn) kn = kn->parent; if (base == kn) break; if ((s - path) + 3 >= PATH_MAX) return -ENAMETOOLONG; strcpy(s, "../"); s += 3; base = base->parent; } /* determine end of target string for reverse fillup */ kn = target; while (kn->parent && kn != base) { len += strlen(kn->name) + 1; kn = kn->parent; } /* check limits */ if (len < 2) return -EINVAL; len--; if ((s - path) + len >= PATH_MAX) return -ENAMETOOLONG; /* reverse fillup of target string from target to base */ kn = target; while (kn->parent && kn != base) { int slen = strlen(kn->name); len -= slen; memcpy(s + len, kn->name, slen); if (len) s[--len] = '/'; kn = kn->parent; } return 0; } static int kernfs_getlink(struct inode *inode, char *path) { struct kernfs_node *kn = inode->i_private; struct kernfs_node *parent = kn->parent; struct kernfs_node *target = kn->symlink.target_kn; struct kernfs_root *root = kernfs_root(parent); int error; down_read(&root->kernfs_rwsem); error = kernfs_get_target_path(parent, target, path); up_read(&root->kernfs_rwsem); return error; } static const char *kernfs_iop_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { char *body; int error; if (!dentry) return ERR_PTR(-ECHILD); body = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!body) return ERR_PTR(-ENOMEM); error = kernfs_getlink(inode, body); if (unlikely(error < 0)) { kfree(body); return ERR_PTR(error); } set_delayed_call(done, kfree_link, body); return body; } const struct inode_operations kernfs_symlink_iops = { .listxattr = kernfs_iop_listxattr, .get_link = kernfs_iop_get_link, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .permission = kernfs_iop_permission, };
46 46 46 46 46 46 46 77 77 77 15 3 12 12 12 46 46 46 46 46 27 134 92 46 27 464 464 464 7 7 5 5 31 8 10 3 19 26 12 12 9 3 12 46 7 1 1 2 2 1 4 4 4 14 14 14 14 5 4 1 5 8 4 4 4 4 3 3 3 3 3 52 52 57 58 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/workqueue.h> #include <linux/rtnetlink.h> #include <linux/cache.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/delay.h> #include <linux/sched.h> #include <linux/idr.h> #include <linux/rculist.h> #include <linux/nsproxy.h> #include <linux/fs.h> #include <linux/proc_ns.h> #include <linux/file.h> #include <linux/export.h> #include <linux/user_namespace.h> #include <linux/net_namespace.h> #include <linux/sched/task.h> #include <linux/uidgid.h> #include <linux/cookie.h> #include <linux/proc_fs.h> #include <net/sock.h> #include <net/netlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> /* * Our network namespace constructor/destructor lists */ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); /* Protects net_namespace_list. Nests iside rtnl_lock() */ DECLARE_RWSEM(net_rwsem); EXPORT_SYMBOL_GPL(net_rwsem); #ifdef CONFIG_KEYS static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) }; #endif struct net init_net; EXPORT_SYMBOL(init_net); static bool init_net_initialized; /* * pernet_ops_rwsem: protects: pernet_list, net_generic_ids, * init_net_initialized and first_device pointer. * This is internal net namespace object. Please, don't use it * outside. */ DECLARE_RWSEM(pernet_ops_rwsem); #define MIN_PERNET_OPS_ID \ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; DEFINE_COOKIE(net_cookie); static struct net_generic *net_alloc_generic(void) { unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs); unsigned int generic_size; struct net_generic *ng; generic_size = offsetof(struct net_generic, ptr[gen_ptrs]); ng = kzalloc(generic_size, GFP_KERNEL); if (ng) ng->s.len = gen_ptrs; return ng; } static int net_assign_generic(struct net *net, unsigned int id, void *data) { struct net_generic *ng, *old_ng; BUG_ON(id < MIN_PERNET_OPS_ID); old_ng = rcu_dereference_protected(net->gen, lockdep_is_held(&pernet_ops_rwsem)); if (old_ng->s.len > id) { old_ng->ptr[id] = data; return 0; } ng = net_alloc_generic(); if (!ng) return -ENOMEM; /* * Some synchronisation notes: * * The net_generic explores the net->gen array inside rcu * read section. Besides once set the net->gen->ptr[x] * pointer never changes (see rules in netns/generic.h). * * That said, we simply duplicate this array and schedule * the old copy for kfree after a grace period. */ memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID], (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *)); ng->ptr[id] = data; rcu_assign_pointer(net->gen, ng); kfree_rcu(old_ng, s.rcu); return 0; } static int ops_init(const struct pernet_operations *ops, struct net *net) { struct net_generic *ng; int err = -ENOMEM; void *data = NULL; if (ops->id) { data = kzalloc(ops->size, GFP_KERNEL); if (!data) goto out; err = net_assign_generic(net, *ops->id, data); if (err) goto cleanup; } err = 0; if (ops->init) err = ops->init(net); if (!err) return 0; if (ops->id) { ng = rcu_dereference_protected(net->gen, lockdep_is_held(&pernet_ops_rwsem)); ng->ptr[*ops->id] = NULL; } cleanup: kfree(data); out: return err; } static void ops_pre_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { struct net *net; if (ops->pre_exit) { list_for_each_entry(net, net_exit_list, exit_list) ops->pre_exit(net); } } static void ops_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { struct net *net; if (ops->exit) { list_for_each_entry(net, net_exit_list, exit_list) { ops->exit(net); cond_resched(); } } if (ops->exit_batch) ops->exit_batch(net_exit_list); } static void ops_free_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { struct net *net; if (ops->id) { list_for_each_entry(net, net_exit_list, exit_list) kfree(net_generic(net, *ops->id)); } } /* should be called with nsid_lock held */ static int alloc_netid(struct net *net, struct net *peer, int reqid) { int min = 0, max = 0; if (reqid >= 0) { min = reqid; max = reqid + 1; } return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC); } /* This function is used by idr_for_each(). If net is equal to peer, the * function returns the id so that idr_for_each() stops. Because we cannot * returns the id 0 (idr_for_each() will not stop), we return the magic value * NET_ID_ZERO (-1) for it. */ #define NET_ID_ZERO -1 static int net_eq_idr(int id, void *net, void *peer) { if (net_eq(net, peer)) return id ? : NET_ID_ZERO; return 0; } /* Must be called from RCU-critical section or with nsid_lock held */ static int __peernet2id(const struct net *net, struct net *peer) { int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); /* Magic value for id 0. */ if (id == NET_ID_ZERO) return 0; if (id > 0) return id; return NETNSA_NSID_NOT_ASSIGNED; } static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, struct nlmsghdr *nlh, gfp_t gfp); /* This function returns the id of a peer netns. If no id is assigned, one will * be allocated and returned. */ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) { int id; if (refcount_read(&net->ns.count) == 0) return NETNSA_NSID_NOT_ASSIGNED; spin_lock_bh(&net->nsid_lock); id = __peernet2id(net, peer); if (id >= 0) { spin_unlock_bh(&net->nsid_lock); return id; } /* When peer is obtained from RCU lists, we may race with * its cleanup. Check whether it's alive, and this guarantees * we never hash a peer back to net->netns_ids, after it has * just been idr_remove()'d from there in cleanup_net(). */ if (!maybe_get_net(peer)) { spin_unlock_bh(&net->nsid_lock); return NETNSA_NSID_NOT_ASSIGNED; } id = alloc_netid(net, peer, -1); spin_unlock_bh(&net->nsid_lock); put_net(peer); if (id < 0) return NETNSA_NSID_NOT_ASSIGNED; rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp); return id; } EXPORT_SYMBOL_GPL(peernet2id_alloc); /* This function returns, if assigned, the id of a peer netns. */ int peernet2id(const struct net *net, struct net *peer) { int id; rcu_read_lock(); id = __peernet2id(net, peer); rcu_read_unlock(); return id; } EXPORT_SYMBOL(peernet2id); /* This function returns true is the peer netns has an id assigned into the * current netns. */ bool peernet_has_id(const struct net *net, struct net *peer) { return peernet2id(net, peer) >= 0; } struct net *get_net_ns_by_id(const struct net *net, int id) { struct net *peer; if (id < 0) return NULL; rcu_read_lock(); peer = idr_find(&net->netns_ids, id); if (peer) peer = maybe_get_net(peer); rcu_read_unlock(); return peer; } EXPORT_SYMBOL_GPL(get_net_ns_by_id); static __net_init void preinit_net_sysctl(struct net *net) { net->core.sysctl_somaxconn = SOMAXCONN; /* Limits per socket sk_omem_alloc usage. * TCP zerocopy regular usage needs 128 KB. */ net->core.sysctl_optmem_max = 128 * 1024; net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; net->core.sysctl_tstamp_allow_data = 1; } /* init code that must occur even if setup_net() is not called. */ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns) { refcount_set(&net->passive, 1); refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt"); ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt"); get_random_bytes(&net->hash_mix, sizeof(u32)); net->dev_base_seq = 1; net->user_ns = user_ns; idr_init(&net->netns_ids); spin_lock_init(&net->nsid_lock); mutex_init(&net->ipv4.ra_mutex); #ifdef CONFIG_DEBUG_NET_SMALL_RTNL mutex_init(&net->rtnl_mutex); lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL); #endif preinit_net_sysctl(net); } /* * setup_net runs the initializers for the network namespace object. */ static __net_init int setup_net(struct net *net) { /* Must be called with pernet_ops_rwsem held */ const struct pernet_operations *ops, *saved_ops; LIST_HEAD(net_exit_list); LIST_HEAD(dev_kill_list); int error = 0; preempt_disable(); net->net_cookie = gen_cookie_next(&net_cookie); preempt_enable(); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); if (error < 0) goto out_undo; } down_write(&net_rwsem); list_add_tail_rcu(&net->list, &net_namespace_list); up_write(&net_rwsem); out: return error; out_undo: /* Walk through the list backwards calling the exit functions * for the pernet modules whose init functions did not fail. */ list_add(&net->exit_list, &net_exit_list); saved_ops = ops; list_for_each_entry_continue_reverse(ops, &pernet_list, list) ops_pre_exit_list(ops, &net_exit_list); synchronize_rcu(); ops = saved_ops; rtnl_lock(); list_for_each_entry_continue_reverse(ops, &pernet_list, list) { if (ops->exit_batch_rtnl) ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); } unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); ops = saved_ops; list_for_each_entry_continue_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list); ops = saved_ops; list_for_each_entry_continue_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list); rcu_barrier(); goto out; } #ifdef CONFIG_NET_NS static struct ucounts *inc_net_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES); } static void dec_net_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_NET_NAMESPACES); } static struct kmem_cache *net_cachep __ro_after_init; static struct workqueue_struct *netns_wq; static struct net *net_alloc(void) { struct net *net = NULL; struct net_generic *ng; ng = net_alloc_generic(); if (!ng) goto out; net = kmem_cache_zalloc(net_cachep, GFP_KERNEL); if (!net) goto out_free; #ifdef CONFIG_KEYS net->key_domain = kzalloc(sizeof(struct key_tag), GFP_KERNEL); if (!net->key_domain) goto out_free_2; refcount_set(&net->key_domain->usage, 1); #endif rcu_assign_pointer(net->gen, ng); out: return net; #ifdef CONFIG_KEYS out_free_2: kmem_cache_free(net_cachep, net); net = NULL; #endif out_free: kfree(ng); goto out; } static LLIST_HEAD(defer_free_list); static void net_complete_free(void) { struct llist_node *kill_list; struct net *net, *next; /* Get the list of namespaces to free from last round. */ kill_list = llist_del_all(&defer_free_list); llist_for_each_entry_safe(net, next, kill_list, defer_free_list) kmem_cache_free(net_cachep, net); } static void net_free(struct net *net) { if (refcount_dec_and_test(&net->passive)) { kfree(rcu_access_pointer(net->gen)); /* There should not be any trackers left there. */ ref_tracker_dir_exit(&net->notrefcnt_tracker); /* Wait for an extra rcu_barrier() before final free. */ llist_add(&net->defer_free_list, &defer_free_list); } } void net_drop_ns(void *p) { struct net *net = (struct net *)p; if (net) net_free(net); } struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, struct net *old_net) { struct ucounts *ucounts; struct net *net; int rv; if (!(flags & CLONE_NEWNET)) return get_net(old_net); ucounts = inc_net_namespaces(user_ns); if (!ucounts) return ERR_PTR(-ENOSPC); net = net_alloc(); if (!net) { rv = -ENOMEM; goto dec_ucounts; } preinit_net(net, user_ns); net->ucounts = ucounts; get_user_ns(user_ns); rv = down_read_killable(&pernet_ops_rwsem); if (rv < 0) goto put_userns; rv = setup_net(net); up_read(&pernet_ops_rwsem); if (rv < 0) { put_userns: #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif put_user_ns(user_ns); net_free(net); dec_ucounts: dec_net_namespaces(ucounts); return ERR_PTR(rv); } return net; } /** * net_ns_get_ownership - get sysfs ownership data for @net * @net: network namespace in question (can be NULL) * @uid: kernel user ID for sysfs objects * @gid: kernel group ID for sysfs objects * * Returns the uid/gid pair of root in the user namespace associated with the * given network namespace. */ void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid) { if (net) { kuid_t ns_root_uid = make_kuid(net->user_ns, 0); kgid_t ns_root_gid = make_kgid(net->user_ns, 0); if (uid_valid(ns_root_uid)) *uid = ns_root_uid; if (gid_valid(ns_root_gid)) *gid = ns_root_gid; } else { *uid = GLOBAL_ROOT_UID; *gid = GLOBAL_ROOT_GID; } } EXPORT_SYMBOL_GPL(net_ns_get_ownership); static void unhash_nsid(struct net *net, struct net *last) { struct net *tmp; /* This function is only called from cleanup_net() work, * and this work is the only process, that may delete * a net from net_namespace_list. So, when the below * is executing, the list may only grow. Thus, we do not * use for_each_net_rcu() or net_rwsem. */ for_each_net(tmp) { int id; spin_lock_bh(&tmp->nsid_lock); id = __peernet2id(tmp, net); if (id >= 0) idr_remove(&tmp->netns_ids, id); spin_unlock_bh(&tmp->nsid_lock); if (id >= 0) rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL, GFP_KERNEL); if (tmp == last) break; } spin_lock_bh(&net->nsid_lock); idr_destroy(&net->netns_ids); spin_unlock_bh(&net->nsid_lock); } static LLIST_HEAD(cleanup_list); static void cleanup_net(struct work_struct *work) { const struct pernet_operations *ops; struct net *net, *tmp, *last; struct llist_node *net_kill_list; LIST_HEAD(net_exit_list); LIST_HEAD(dev_kill_list); /* Atomically snapshot the list of namespaces to cleanup */ net_kill_list = llist_del_all(&cleanup_list); down_read(&pernet_ops_rwsem); /* Don't let anyone else find us. */ down_write(&net_rwsem); llist_for_each_entry(net, net_kill_list, cleanup_list) list_del_rcu(&net->list); /* Cache last net. After we unlock rtnl, no one new net * added to net_namespace_list can assign nsid pointer * to a net from net_kill_list (see peernet2id_alloc()). * So, we skip them in unhash_nsid(). * * Note, that unhash_nsid() does not delete nsid links * between net_kill_list's nets, as they've already * deleted from net_namespace_list. But, this would be * useless anyway, as netns_ids are destroyed there. */ last = list_last_entry(&net_namespace_list, struct net, list); up_write(&net_rwsem); llist_for_each_entry(net, net_kill_list, cleanup_list) { unhash_nsid(net, last); list_add_tail(&net->exit_list, &net_exit_list); } /* Run all of the network namespace pre_exit methods */ list_for_each_entry_reverse(ops, &pernet_list, list) ops_pre_exit_list(ops, &net_exit_list); /* * Another CPU might be rcu-iterating the list, wait for it. * This needs to be before calling the exit() notifiers, so * the rcu_barrier() below isn't sufficient alone. * Also the pre_exit() and exit() methods need this barrier. */ synchronize_rcu_expedited(); rtnl_lock(); list_for_each_entry_reverse(ops, &pernet_list, list) { if (ops->exit_batch_rtnl) ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); } unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); /* Run all of the network namespace exit methods */ list_for_each_entry_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list); /* Free the net generic variables */ list_for_each_entry_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list); up_read(&pernet_ops_rwsem); /* Ensure there are no outstanding rcu callbacks using this * network namespace. */ rcu_barrier(); net_complete_free(); /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); dec_net_namespaces(net->ucounts); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif put_user_ns(net->user_ns); net_free(net); } } /** * net_ns_barrier - wait until concurrent net_cleanup_work is done * * cleanup_net runs from work queue and will first remove namespaces * from the global list, then run net exit functions. * * Call this in module exit path to make sure that all netns * ->exit ops have been invoked before the function is removed. */ void net_ns_barrier(void) { down_write(&pernet_ops_rwsem); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL(net_ns_barrier); static DECLARE_WORK(net_cleanup_work, cleanup_net); void __put_net(struct net *net) { ref_tracker_dir_exit(&net->refcnt_tracker); /* Cleanup the network namespace in process context */ if (llist_add(&net->cleanup_list, &cleanup_list)) queue_work(netns_wq, &net_cleanup_work); } EXPORT_SYMBOL_GPL(__put_net); /** * get_net_ns - increment the refcount of the network namespace * @ns: common namespace (net) * * Returns the net's common namespace or ERR_PTR() if ref is zero. */ struct ns_common *get_net_ns(struct ns_common *ns) { struct net *net; net = maybe_get_net(container_of(ns, struct net, ns)); if (net) return &net->ns; return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(get_net_ns); struct net *get_net_ns_by_fd(int fd) { CLASS(fd, f)(fd); if (fd_empty(f)) return ERR_PTR(-EBADF); if (proc_ns_file(fd_file(f))) { struct ns_common *ns = get_proc_ns(file_inode(fd_file(f))); if (ns->ops == &netns_operations) return get_net(container_of(ns, struct net, ns)); } return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(get_net_ns_by_fd); #endif struct net *get_net_ns_by_pid(pid_t pid) { struct task_struct *tsk; struct net *net; /* Lookup the network namespace */ net = ERR_PTR(-ESRCH); rcu_read_lock(); tsk = find_task_by_vpid(pid); if (tsk) { struct nsproxy *nsproxy; task_lock(tsk); nsproxy = tsk->nsproxy; if (nsproxy) net = get_net(nsproxy->net_ns); task_unlock(tsk); } rcu_read_unlock(); return net; } EXPORT_SYMBOL_GPL(get_net_ns_by_pid); static __net_init int net_ns_net_init(struct net *net) { #ifdef CONFIG_NET_NS net->ns.ops = &netns_operations; #endif return ns_alloc_inum(&net->ns); } static __net_exit void net_ns_net_exit(struct net *net) { ns_free_inum(&net->ns); } static struct pernet_operations __net_initdata net_ns_ops = { .init = net_ns_net_init, .exit = net_ns_net_exit, }; static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { [NETNSA_NONE] = { .type = NLA_UNSPEC }, [NETNSA_NSID] = { .type = NLA_S32 }, [NETNSA_PID] = { .type = NLA_U32 }, [NETNSA_FD] = { .type = NLA_U32 }, [NETNSA_TARGET_NSID] = { .type = NLA_S32 }, }; static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; struct nlattr *nla; struct net *peer; int nsid, err; err = nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); if (err < 0) return err; if (!tb[NETNSA_NSID]) { NL_SET_ERR_MSG(extack, "nsid is missing"); return -EINVAL; } nsid = nla_get_s32(tb[NETNSA_NSID]); if (tb[NETNSA_PID]) { peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID])); nla = tb[NETNSA_PID]; } else if (tb[NETNSA_FD]) { peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; } else { NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); return -EINVAL; } if (IS_ERR(peer)) { NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, "Peer netns reference is invalid"); return PTR_ERR(peer); } spin_lock_bh(&net->nsid_lock); if (__peernet2id(net, peer) >= 0) { spin_unlock_bh(&net->nsid_lock); err = -EEXIST; NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, "Peer netns already has a nsid assigned"); goto out; } err = alloc_netid(net, peer, nsid); spin_unlock_bh(&net->nsid_lock); if (err >= 0) { rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid, nlh, GFP_KERNEL); err = 0; } else if (err == -ENOSPC && nsid >= 0) { err = -EEXIST; NL_SET_BAD_ATTR(extack, tb[NETNSA_NSID]); NL_SET_ERR_MSG(extack, "The specified nsid is already used"); } out: put_net(peer); return err; } static int rtnl_net_get_size(void) { return NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(sizeof(s32)) /* NETNSA_NSID */ + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */ ; } struct net_fill_args { u32 portid; u32 seq; int flags; int cmd; int nsid; bool add_ref; int ref_nsid; }; static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args) { struct nlmsghdr *nlh; struct rtgenmsg *rth; nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth), args->flags); if (!nlh) return -EMSGSIZE; rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; if (nla_put_s32(skb, NETNSA_NSID, args->nsid)) goto nla_put_failure; if (args->add_ref && nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int rtnl_net_valid_getid_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { int i, err; if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); if (err) return err; for (i = 0; i <= NETNSA_MAX; i++) { if (!tb[i]) continue; switch (i) { case NETNSA_PID: case NETNSA_FD: case NETNSA_NSID: case NETNSA_TARGET_NSID: break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in peer netns getid request"); return -EINVAL; } } return 0; } static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; struct net_fill_args fillargs = { .portid = NETLINK_CB(skb).portid, .seq = nlh->nlmsg_seq, .cmd = RTM_NEWNSID, }; struct net *peer, *target = net; struct nlattr *nla; struct sk_buff *msg; int err; err = rtnl_net_valid_getid_req(skb, nlh, tb, extack); if (err < 0) return err; if (tb[NETNSA_PID]) { peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID])); nla = tb[NETNSA_PID]; } else if (tb[NETNSA_FD]) { peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; } else if (tb[NETNSA_NSID]) { peer = get_net_ns_by_id(net, nla_get_s32(tb[NETNSA_NSID])); if (!peer) peer = ERR_PTR(-ENOENT); nla = tb[NETNSA_NSID]; } else { NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); return -EINVAL; } if (IS_ERR(peer)) { NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, "Peer netns reference is invalid"); return PTR_ERR(peer); } if (tb[NETNSA_TARGET_NSID]) { int id = nla_get_s32(tb[NETNSA_TARGET_NSID]); target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id); if (IS_ERR(target)) { NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]); NL_SET_ERR_MSG(extack, "Target netns reference is invalid"); err = PTR_ERR(target); goto out; } fillargs.add_ref = true; fillargs.ref_nsid = peernet2id(net, peer); } msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); if (!msg) { err = -ENOMEM; goto out; } fillargs.nsid = peernet2id(target, peer); err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid); goto out; err_out: nlmsg_free(msg); out: if (fillargs.add_ref) put_net(target); put_net(peer); return err; } struct rtnl_net_dump_cb { struct net *tgt_net; struct net *ref_net; struct sk_buff *skb; struct net_fill_args fillargs; int idx; int s_idx; }; /* Runs in RCU-critical section. */ static int rtnl_net_dumpid_one(int id, void *peer, void *data) { struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data; int ret; if (net_cb->idx < net_cb->s_idx) goto cont; net_cb->fillargs.nsid = id; if (net_cb->fillargs.add_ref) net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer); ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs); if (ret < 0) return ret; cont: net_cb->idx++; return 0; } static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk, struct rtnl_net_dump_cb *net_cb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[NETNSA_MAX + 1]; int err, i; err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); if (err < 0) return err; for (i = 0; i <= NETNSA_MAX; i++) { if (!tb[i]) continue; if (i == NETNSA_TARGET_NSID) { struct net *net; net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i])); if (IS_ERR(net)) { NL_SET_BAD_ATTR(extack, tb[i]); NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); return PTR_ERR(net); } net_cb->fillargs.add_ref = true; net_cb->ref_net = net_cb->tgt_net; net_cb->tgt_net = net; } else { NL_SET_BAD_ATTR(extack, tb[i]); NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; } } return 0; } static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) { struct rtnl_net_dump_cb net_cb = { .tgt_net = sock_net(skb->sk), .skb = skb, .fillargs = { .portid = NETLINK_CB(cb->skb).portid, .seq = cb->nlh->nlmsg_seq, .flags = NLM_F_MULTI, .cmd = RTM_NEWNSID, }, .idx = 0, .s_idx = cb->args[0], }; int err = 0; if (cb->strict_check) { err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb); if (err < 0) goto end; } rcu_read_lock(); idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); rcu_read_unlock(); cb->args[0] = net_cb.idx; end: if (net_cb.fillargs.add_ref) put_net(net_cb.tgt_net); return err; } static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, struct nlmsghdr *nlh, gfp_t gfp) { struct net_fill_args fillargs = { .portid = portid, .seq = nlh ? nlh->nlmsg_seq : 0, .cmd = cmd, .nsid = id, }; struct sk_buff *msg; int err = -ENOMEM; msg = nlmsg_new(rtnl_net_get_size(), gfp); if (!msg) goto out; err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, gfp); return; err_out: nlmsg_free(msg); out: rtnl_set_sk_err(net, RTNLGRP_NSID, err); } #ifdef CONFIG_NET_NS static void __init netns_ipv4_struct_check(void) { /* TX readonly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_early_retrans); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_tso_win_divisor); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_tso_rtt_log); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_autocorking); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_min_snd_mss); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_notsent_lowat); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_limit_output_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_min_rtt_wlen); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_wmem); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_ip_fwd_use_pmtu); CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33); /* TXRX readonly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx, sysctl_tcp_moderate_rcvbuf); CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1); /* RX readonly hotpath cache line */ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_ip_early_demux); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_early_demux); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_l3mdev_accept); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_reordering); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_rmem); CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22); } #endif static const struct rtnl_msg_handler net_ns_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWNSID, .doit = rtnl_net_newid, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.msgtype = RTM_GETNSID, .doit = rtnl_net_getid, .dumpit = rtnl_net_dumpid, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, }; void __init net_ns_init(void) { struct net_generic *ng; #ifdef CONFIG_NET_NS netns_ipv4_struct_check(); net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), SMP_CACHE_BYTES, SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Create workqueue for cleanup */ netns_wq = create_singlethread_workqueue("netns"); if (!netns_wq) panic("Could not create netns workq"); #endif ng = net_alloc_generic(); if (!ng) panic("Could not allocate generic netns"); rcu_assign_pointer(init_net.gen, ng); #ifdef CONFIG_KEYS init_net.key_domain = &init_net_key_domain; #endif preinit_net(&init_net, &init_user_ns); down_write(&pernet_ops_rwsem); if (setup_net(&init_net)) panic("Could not setup the initial network namespace"); init_net_initialized = true; up_write(&pernet_ops_rwsem); if (register_pernet_subsys(&net_ns_ops)) panic("Could not register network namespace subsystems"); rtnl_register_many(net_ns_rtnl_msg_handlers); } static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) { ops_pre_exit_list(ops, net_exit_list); synchronize_rcu(); if (ops->exit_batch_rtnl) { LIST_HEAD(dev_kill_list); rtnl_lock(); ops->exit_batch_rtnl(net_exit_list, &dev_kill_list); unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); } ops_exit_list(ops, net_exit_list); ops_free_list(ops, net_exit_list); } #ifdef CONFIG_NET_NS static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { struct net *net; int error; LIST_HEAD(net_exit_list); list_add_tail(&ops->list, list); if (ops->init || ops->id) { /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. */ for_each_net(net) { error = ops_init(ops, net); if (error) goto out_undo; list_add_tail(&net->exit_list, &net_exit_list); } } return 0; out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); free_exit_list(ops, &net_exit_list); return error; } static void __unregister_pernet_operations(struct pernet_operations *ops) { struct net *net; LIST_HEAD(net_exit_list); list_del(&ops->list); /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); free_exit_list(ops, &net_exit_list); } #else static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { if (!init_net_initialized) { list_add_tail(&ops->list, list); return 0; } return ops_init(ops, &init_net); } static void __unregister_pernet_operations(struct pernet_operations *ops) { if (!init_net_initialized) { list_del(&ops->list); } else { LIST_HEAD(net_exit_list); list_add(&init_net.exit_list, &net_exit_list); free_exit_list(ops, &net_exit_list); } } #endif /* CONFIG_NET_NS */ static DEFINE_IDA(net_generic_ids); static int register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { int error; if (WARN_ON(!!ops->id ^ !!ops->size)) return -EINVAL; if (ops->id) { error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID, GFP_KERNEL); if (error < 0) return error; *ops->id = error; /* This does not require READ_ONCE as writers already hold * pernet_ops_rwsem. But WRITE_ONCE is needed to protect * net_alloc_generic. */ WRITE_ONCE(max_gen_ptrs, max(max_gen_ptrs, *ops->id + 1)); } error = __register_pernet_operations(list, ops); if (error) { rcu_barrier(); if (ops->id) ida_free(&net_generic_ids, *ops->id); } return error; } static void unregister_pernet_operations(struct pernet_operations *ops) { __unregister_pernet_operations(ops); rcu_barrier(); if (ops->id) ida_free(&net_generic_ids, *ops->id); } /** * register_pernet_subsys - register a network namespace subsystem * @ops: pernet operations structure for the subsystem * * Register a subsystem which has init and exit functions * that are called when network namespaces are created and * destroyed respectively. * * When registered all network namespace init functions are * called for every existing network namespace. Allowing kernel * modules to have a race free view of the set of network namespaces. * * When a new network namespace is created all of the init * methods are called in the order in which they were registered. * * When a network namespace is destroyed all of the exit methods * are called in the reverse of the order with which they were * registered. */ int register_pernet_subsys(struct pernet_operations *ops) { int error; down_write(&pernet_ops_rwsem); error = register_pernet_operations(first_device, ops); up_write(&pernet_ops_rwsem); return error; } EXPORT_SYMBOL_GPL(register_pernet_subsys); /** * unregister_pernet_subsys - unregister a network namespace subsystem * @ops: pernet operations structure to manipulate * * Remove the pernet operations structure from the list to be * used when network namespaces are created or destroyed. In * addition run the exit method for all existing network * namespaces. */ void unregister_pernet_subsys(struct pernet_operations *ops) { down_write(&pernet_ops_rwsem); unregister_pernet_operations(ops); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(unregister_pernet_subsys); /** * register_pernet_device - register a network namespace device * @ops: pernet operations structure for the subsystem * * Register a device which has init and exit functions * that are called when network namespaces are created and * destroyed respectively. * * When registered all network namespace init functions are * called for every existing network namespace. Allowing kernel * modules to have a race free view of the set of network namespaces. * * When a new network namespace is created all of the init * methods are called in the order in which they were registered. * * When a network namespace is destroyed all of the exit methods * are called in the reverse of the order with which they were * registered. */ int register_pernet_device(struct pernet_operations *ops) { int error; down_write(&pernet_ops_rwsem); error = register_pernet_operations(&pernet_list, ops); if (!error && (first_device == &pernet_list)) first_device = &ops->list; up_write(&pernet_ops_rwsem); return error; } EXPORT_SYMBOL_GPL(register_pernet_device); /** * unregister_pernet_device - unregister a network namespace netdevice * @ops: pernet operations structure to manipulate * * Remove the pernet operations structure from the list to be * used when network namespaces are created or destroyed. In * addition run the exit method for all existing network * namespaces. */ void unregister_pernet_device(struct pernet_operations *ops) { down_write(&pernet_ops_rwsem); if (&ops->list == first_device) first_device = first_device->next; unregister_pernet_operations(ops); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(unregister_pernet_device); #ifdef CONFIG_NET_NS static struct ns_common *netns_get(struct task_struct *task) { struct net *net = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) net = get_net(nsproxy->net_ns); task_unlock(task); return net ? &net->ns : NULL; } static inline struct net *to_net_ns(struct ns_common *ns) { return container_of(ns, struct net, ns); } static void netns_put(struct ns_common *ns) { put_net(to_net_ns(ns)); } static int netns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct net *net = to_net_ns(ns); if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; put_net(nsproxy->net_ns); nsproxy->net_ns = get_net(net); return 0; } static struct user_namespace *netns_owner(struct ns_common *ns) { return to_net_ns(ns)->user_ns; } const struct proc_ns_operations netns_operations = { .name = "net", .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, .owner = netns_owner, }; #endif
45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 // SPDX-License-Identifier: GPL-2.0 /* Bareudp: UDP tunnel encasulation for different Payload types like * MPLS, NSH, IP, etc. * Copyright (c) 2019 Nokia, Inc. * Authors: Martin Varghese, <martin.varghese@nokia.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/etherdevice.h> #include <linux/hash.h> #include <net/dst_metadata.h> #include <net/gro_cells.h> #include <net/rtnetlink.h> #include <net/protocol.h> #include <net/ip6_tunnel.h> #include <net/ip_tunnels.h> #include <net/udp_tunnel.h> #include <net/bareudp.h> #define BAREUDP_BASE_HLEN sizeof(struct udphdr) #define BAREUDP_IPV4_HLEN (sizeof(struct iphdr) + \ sizeof(struct udphdr)) #define BAREUDP_IPV6_HLEN (sizeof(struct ipv6hdr) + \ sizeof(struct udphdr)) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); /* per-network namespace private data for this module */ static unsigned int bareudp_net_id; struct bareudp_net { struct list_head bareudp_list; }; struct bareudp_conf { __be16 ethertype; __be16 port; u16 sport_min; bool multi_proto_mode; }; /* Pseudo network device */ struct bareudp_dev { struct net *net; /* netns for packet i/o */ struct net_device *dev; /* netdev for bareudp tunnel */ __be16 ethertype; __be16 port; u16 sport_min; bool multi_proto_mode; struct socket __rcu *sock; struct list_head next; /* bareudp node on namespace list */ struct gro_cells gro_cells; }; static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct metadata_dst *tun_dst = NULL; IP_TUNNEL_DECLARE_FLAGS(key) = { }; struct bareudp_dev *bareudp; unsigned short family; unsigned int len; __be16 proto; void *oiph; int err; int nh; bareudp = rcu_dereference_sk_user_data(sk); if (!bareudp) goto drop; if (skb->protocol == htons(ETH_P_IP)) family = AF_INET; else family = AF_INET6; if (bareudp->ethertype == htons(ETH_P_IP)) { __u8 ipversion; if (skb_copy_bits(skb, BAREUDP_BASE_HLEN, &ipversion, sizeof(ipversion))) { dev_core_stats_rx_dropped_inc(bareudp->dev); goto drop; } ipversion >>= 4; if (ipversion == 4) { proto = htons(ETH_P_IP); } else if (ipversion == 6 && bareudp->multi_proto_mode) { proto = htons(ETH_P_IPV6); } else { dev_core_stats_rx_dropped_inc(bareudp->dev); goto drop; } } else if (bareudp->ethertype == htons(ETH_P_MPLS_UC)) { struct iphdr *tunnel_hdr; tunnel_hdr = (struct iphdr *)skb_network_header(skb); if (tunnel_hdr->version == 4) { if (!ipv4_is_multicast(tunnel_hdr->daddr)) { proto = bareudp->ethertype; } else if (bareudp->multi_proto_mode && ipv4_is_multicast(tunnel_hdr->daddr)) { proto = htons(ETH_P_MPLS_MC); } else { dev_core_stats_rx_dropped_inc(bareudp->dev); goto drop; } } else { int addr_type; struct ipv6hdr *tunnel_hdr_v6; tunnel_hdr_v6 = (struct ipv6hdr *)skb_network_header(skb); addr_type = ipv6_addr_type((struct in6_addr *)&tunnel_hdr_v6->daddr); if (!(addr_type & IPV6_ADDR_MULTICAST)) { proto = bareudp->ethertype; } else if (bareudp->multi_proto_mode && (addr_type & IPV6_ADDR_MULTICAST)) { proto = htons(ETH_P_MPLS_MC); } else { dev_core_stats_rx_dropped_inc(bareudp->dev); goto drop; } } } else { proto = bareudp->ethertype; } if (iptunnel_pull_header(skb, BAREUDP_BASE_HLEN, proto, !net_eq(bareudp->net, dev_net(bareudp->dev)))) { dev_core_stats_rx_dropped_inc(bareudp->dev); goto drop; } __set_bit(IP_TUNNEL_KEY_BIT, key); tun_dst = udp_tun_rx_dst(skb, family, key, 0, 0); if (!tun_dst) { dev_core_stats_rx_dropped_inc(bareudp->dev); goto drop; } skb_dst_set(skb, &tun_dst->dst); skb->dev = bareudp->dev; skb_reset_mac_header(skb); /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(bareudp->dev, rx_length_errors); DEV_STATS_INC(bareudp->dev, rx_errors); goto drop; } /* Get the outer header. */ oiph = skb->head + nh; if (!ipv6_mod_enabled() || family == AF_INET) err = IP_ECN_decapsulate(oiph, skb); else err = IP6_ECN_decapsulate(oiph, skb); if (unlikely(err)) { if (log_ecn_error) { if (!ipv6_mod_enabled() || family == AF_INET) net_info_ratelimited("non-ECT from %pI4 " "with TOS=%#x\n", &((struct iphdr *)oiph)->saddr, ((struct iphdr *)oiph)->tos); else net_info_ratelimited("non-ECT from %pI6\n", &((struct ipv6hdr *)oiph)->saddr); } if (err > 1) { DEV_STATS_INC(bareudp->dev, rx_frame_errors); DEV_STATS_INC(bareudp->dev, rx_errors); goto drop; } } len = skb->len; err = gro_cells_receive(&bareudp->gro_cells, skb); if (likely(err == NET_RX_SUCCESS)) dev_sw_netstats_rx_add(bareudp->dev, len); return 0; drop: /* Consume bad packet */ kfree_skb(skb); return 0; } static int bareudp_err_lookup(struct sock *sk, struct sk_buff *skb) { return 0; } static int bareudp_init(struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); int err; err = gro_cells_init(&bareudp->gro_cells, dev); if (err) return err; return 0; } static void bareudp_uninit(struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); gro_cells_destroy(&bareudp->gro_cells); } static struct socket *bareudp_create_sock(struct net *net, __be16 port) { struct udp_port_cfg udp_conf; struct socket *sock; int err; memset(&udp_conf, 0, sizeof(udp_conf)); if (ipv6_mod_enabled()) udp_conf.family = AF_INET6; else udp_conf.family = AF_INET; udp_conf.local_udp_port = port; /* Open UDP socket */ err = udp_sock_create(net, &udp_conf, &sock); if (err < 0) return ERR_PTR(err); udp_allow_gso(sock->sk); return sock; } /* Create new listen socket if needed */ static int bareudp_socket_create(struct bareudp_dev *bareudp, __be16 port) { struct udp_tunnel_sock_cfg tunnel_cfg; struct socket *sock; sock = bareudp_create_sock(bareudp->net, port); if (IS_ERR(sock)) return PTR_ERR(sock); /* Mark socket as an encapsulation socket */ memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); tunnel_cfg.sk_user_data = bareudp; tunnel_cfg.encap_type = 1; tunnel_cfg.encap_rcv = bareudp_udp_encap_recv; tunnel_cfg.encap_err_lookup = bareudp_err_lookup; tunnel_cfg.encap_destroy = NULL; setup_udp_tunnel_sock(bareudp->net, sock, &tunnel_cfg); rcu_assign_pointer(bareudp->sock, sock); return 0; } static int bareudp_open(struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); int ret = 0; ret = bareudp_socket_create(bareudp, bareudp->port); return ret; } static void bareudp_sock_release(struct bareudp_dev *bareudp) { struct socket *sock; sock = bareudp->sock; rcu_assign_pointer(bareudp->sock, NULL); synchronize_net(); udp_tunnel_sock_release(sock); } static int bareudp_stop(struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); bareudp_sock_release(bareudp); return 0; } static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct bareudp_dev *bareudp, const struct ip_tunnel_info *info) { bool udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); bool xnet = !net_eq(bareudp->net, dev_net(bareudp->dev)); bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct socket *sock = rcu_dereference(bareudp->sock); const struct ip_tunnel_key *key = &info->key; struct rtable *rt; __be16 sport, df; int min_headroom; __u8 tos, ttl; __be32 saddr; int err; if (skb_vlan_inet_prepare(skb, skb->protocol != htons(ETH_P_TEB))) return -EINVAL; if (!sock) return -ESHUTDOWN; sport = udp_flow_src_port(bareudp->net, skb, bareudp->sport_min, USHRT_MAX, true); rt = udp_tunnel_dst_lookup(skb, dev, bareudp->net, 0, &saddr, &info->key, sport, bareudp->port, key->tos, use_cache ? (struct dst_cache *)&info->dst_cache : NULL); if (IS_ERR(rt)) return PTR_ERR(rt); skb_tunnel_check_pmtu(skb, &rt->dst, BAREUDP_IPV4_HLEN + info->options_len, false); tos = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb); ttl = key->ttl; df = test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags) ? htons(IP_DF) : 0; skb_scrub_packet(skb, xnet); err = -ENOSPC; if (!skb_pull(skb, skb_network_offset(skb))) goto free_dst; min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + BAREUDP_BASE_HLEN + info->options_len + sizeof(struct iphdr); err = skb_cow_head(skb, min_headroom); if (unlikely(err)) goto free_dst; err = udp_tunnel_handle_offloads(skb, udp_sum); if (err) goto free_dst; skb_set_inner_protocol(skb, bareudp->ethertype); udp_tunnel_xmit_skb(rt, sock->sk, skb, saddr, info->key.u.ipv4.dst, tos, ttl, df, sport, bareudp->port, !net_eq(bareudp->net, dev_net(bareudp->dev)), !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags)); return 0; free_dst: dst_release(&rt->dst); return err; } static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct bareudp_dev *bareudp, const struct ip_tunnel_info *info) { bool udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); bool xnet = !net_eq(bareudp->net, dev_net(bareudp->dev)); bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct socket *sock = rcu_dereference(bareudp->sock); const struct ip_tunnel_key *key = &info->key; struct dst_entry *dst = NULL; struct in6_addr saddr, daddr; int min_headroom; __u8 prio, ttl; __be16 sport; int err; if (skb_vlan_inet_prepare(skb, skb->protocol != htons(ETH_P_TEB))) return -EINVAL; if (!sock) return -ESHUTDOWN; sport = udp_flow_src_port(bareudp->net, skb, bareudp->sport_min, USHRT_MAX, true); dst = udp_tunnel6_dst_lookup(skb, dev, bareudp->net, sock, 0, &saddr, key, sport, bareudp->port, key->tos, use_cache ? (struct dst_cache *) &info->dst_cache : NULL); if (IS_ERR(dst)) return PTR_ERR(dst); skb_tunnel_check_pmtu(skb, dst, BAREUDP_IPV6_HLEN + info->options_len, false); prio = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb); ttl = key->ttl; skb_scrub_packet(skb, xnet); err = -ENOSPC; if (!skb_pull(skb, skb_network_offset(skb))) goto free_dst; min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + BAREUDP_BASE_HLEN + info->options_len + sizeof(struct ipv6hdr); err = skb_cow_head(skb, min_headroom); if (unlikely(err)) goto free_dst; err = udp_tunnel_handle_offloads(skb, udp_sum); if (err) goto free_dst; daddr = info->key.u.ipv6.dst; udp_tunnel6_xmit_skb(dst, sock->sk, skb, dev, &saddr, &daddr, prio, ttl, info->key.label, sport, bareudp->port, !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags)); return 0; free_dst: dst_release(dst); return err; } static bool bareudp_proto_valid(struct bareudp_dev *bareudp, __be16 proto) { if (bareudp->ethertype == proto) return true; if (!bareudp->multi_proto_mode) return false; if (bareudp->ethertype == htons(ETH_P_MPLS_UC) && proto == htons(ETH_P_MPLS_MC)) return true; if (bareudp->ethertype == htons(ETH_P_IP) && proto == htons(ETH_P_IPV6)) return true; return false; } static netdev_tx_t bareudp_xmit(struct sk_buff *skb, struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); struct ip_tunnel_info *info = NULL; int err; if (!bareudp_proto_valid(bareudp, skb->protocol)) { err = -EINVAL; goto tx_error; } info = skb_tunnel_info(skb); if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) { err = -EINVAL; goto tx_error; } rcu_read_lock(); if (ipv6_mod_enabled() && info->mode & IP_TUNNEL_INFO_IPV6) err = bareudp6_xmit_skb(skb, dev, bareudp, info); else err = bareudp_xmit_skb(skb, dev, bareudp, info); rcu_read_unlock(); if (likely(!err)) return NETDEV_TX_OK; tx_error: dev_kfree_skb(skb); if (err == -ELOOP) DEV_STATS_INC(dev, collisions); else if (err == -ENETUNREACH) DEV_STATS_INC(dev, tx_carrier_errors); DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static int bareudp_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info = skb_tunnel_info(skb); struct bareudp_dev *bareudp = netdev_priv(dev); bool use_cache; __be16 sport; use_cache = ip_tunnel_dst_cache_usable(skb, info); sport = udp_flow_src_port(bareudp->net, skb, bareudp->sport_min, USHRT_MAX, true); if (!ipv6_mod_enabled() || ip_tunnel_info_af(info) == AF_INET) { struct rtable *rt; __be32 saddr; rt = udp_tunnel_dst_lookup(skb, dev, bareudp->net, 0, &saddr, &info->key, sport, bareudp->port, info->key.tos, use_cache ? &info->dst_cache : NULL); if (IS_ERR(rt)) return PTR_ERR(rt); ip_rt_put(rt); info->key.u.ipv4.src = saddr; } else if (ip_tunnel_info_af(info) == AF_INET6) { struct dst_entry *dst; struct in6_addr saddr; struct socket *sock = rcu_dereference(bareudp->sock); dst = udp_tunnel6_dst_lookup(skb, dev, bareudp->net, sock, 0, &saddr, &info->key, sport, bareudp->port, info->key.tos, use_cache ? &info->dst_cache : NULL); if (IS_ERR(dst)) return PTR_ERR(dst); dst_release(dst); info->key.u.ipv6.src = saddr; } else { return -EINVAL; } info->key.tp_src = sport; info->key.tp_dst = bareudp->port; return 0; } static const struct net_device_ops bareudp_netdev_ops = { .ndo_init = bareudp_init, .ndo_uninit = bareudp_uninit, .ndo_open = bareudp_open, .ndo_stop = bareudp_stop, .ndo_start_xmit = bareudp_xmit, .ndo_fill_metadata_dst = bareudp_fill_metadata_dst, }; static const struct nla_policy bareudp_policy[IFLA_BAREUDP_MAX + 1] = { [IFLA_BAREUDP_PORT] = { .type = NLA_U16 }, [IFLA_BAREUDP_ETHERTYPE] = { .type = NLA_U16 }, [IFLA_BAREUDP_SRCPORT_MIN] = { .type = NLA_U16 }, [IFLA_BAREUDP_MULTIPROTO_MODE] = { .type = NLA_FLAG }, }; /* Info for udev, that this is a virtual tunnel endpoint */ static const struct device_type bareudp_type = { .name = "bareudp", }; /* Initialize the device structure. */ static void bareudp_setup(struct net_device *dev) { dev->netdev_ops = &bareudp_netdev_ops; dev->needs_free_netdev = true; SET_NETDEV_DEVTYPE(dev, &bareudp_type); dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->hw_features |= NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->hard_header_len = 0; dev->addr_len = 0; dev->mtu = ETH_DATA_LEN; dev->min_mtu = IPV4_MIN_MTU; dev->max_mtu = IP_MAX_MTU - BAREUDP_BASE_HLEN; dev->type = ARPHRD_NONE; netif_keep_dst(dev); dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; } static int bareudp_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (!data) { NL_SET_ERR_MSG(extack, "Not enough attributes provided to perform the operation"); return -EINVAL; } return 0; } static int bareudp2info(struct nlattr *data[], struct bareudp_conf *conf, struct netlink_ext_ack *extack) { memset(conf, 0, sizeof(*conf)); if (!data[IFLA_BAREUDP_PORT]) { NL_SET_ERR_MSG(extack, "port not specified"); return -EINVAL; } if (!data[IFLA_BAREUDP_ETHERTYPE]) { NL_SET_ERR_MSG(extack, "ethertype not specified"); return -EINVAL; } conf->port = nla_get_u16(data[IFLA_BAREUDP_PORT]); conf->ethertype = nla_get_u16(data[IFLA_BAREUDP_ETHERTYPE]); if (data[IFLA_BAREUDP_SRCPORT_MIN]) conf->sport_min = nla_get_u16(data[IFLA_BAREUDP_SRCPORT_MIN]); if (data[IFLA_BAREUDP_MULTIPROTO_MODE]) conf->multi_proto_mode = true; return 0; } static struct bareudp_dev *bareudp_find_dev(struct bareudp_net *bn, const struct bareudp_conf *conf) { struct bareudp_dev *bareudp, *t = NULL; list_for_each_entry(bareudp, &bn->bareudp_list, next) { if (conf->port == bareudp->port) t = bareudp; } return t; } static int bareudp_configure(struct net *net, struct net_device *dev, struct bareudp_conf *conf, struct netlink_ext_ack *extack) { struct bareudp_net *bn = net_generic(net, bareudp_net_id); struct bareudp_dev *t, *bareudp = netdev_priv(dev); int err; bareudp->net = net; bareudp->dev = dev; t = bareudp_find_dev(bn, conf); if (t) { NL_SET_ERR_MSG(extack, "Another bareudp device using the same port already exists"); return -EBUSY; } if (conf->multi_proto_mode && (conf->ethertype != htons(ETH_P_MPLS_UC) && conf->ethertype != htons(ETH_P_IP))) { NL_SET_ERR_MSG(extack, "Cannot set multiproto mode for this ethertype (only IPv4 and unicast MPLS are supported)"); return -EINVAL; } bareudp->port = conf->port; bareudp->ethertype = conf->ethertype; bareudp->sport_min = conf->sport_min; bareudp->multi_proto_mode = conf->multi_proto_mode; err = register_netdevice(dev); if (err) return err; list_add(&bareudp->next, &bn->bareudp_list); return 0; } static int bareudp_link_config(struct net_device *dev, struct nlattr *tb[]) { int err; if (tb[IFLA_MTU]) { err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); if (err) return err; } return 0; } static void bareudp_dellink(struct net_device *dev, struct list_head *head) { struct bareudp_dev *bareudp = netdev_priv(dev); list_del(&bareudp->next); unregister_netdevice_queue(dev, head); } static int bareudp_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct bareudp_conf conf; int err; err = bareudp2info(data, &conf, extack); if (err) return err; err = bareudp_configure(net, dev, &conf, extack); if (err) return err; err = bareudp_link_config(dev, tb); if (err) goto err_unconfig; return 0; err_unconfig: bareudp_dellink(dev, NULL); return err; } static size_t bareudp_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_PORT */ nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_ETHERTYPE */ nla_total_size(sizeof(__u16)) + /* IFLA_BAREUDP_SRCPORT_MIN */ nla_total_size(0) + /* IFLA_BAREUDP_MULTIPROTO_MODE */ 0; } static int bareudp_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); if (nla_put_be16(skb, IFLA_BAREUDP_PORT, bareudp->port)) goto nla_put_failure; if (nla_put_be16(skb, IFLA_BAREUDP_ETHERTYPE, bareudp->ethertype)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BAREUDP_SRCPORT_MIN, bareudp->sport_min)) goto nla_put_failure; if (bareudp->multi_proto_mode && nla_put_flag(skb, IFLA_BAREUDP_MULTIPROTO_MODE)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops bareudp_link_ops __read_mostly = { .kind = "bareudp", .maxtype = IFLA_BAREUDP_MAX, .policy = bareudp_policy, .priv_size = sizeof(struct bareudp_dev), .setup = bareudp_setup, .validate = bareudp_validate, .newlink = bareudp_newlink, .dellink = bareudp_dellink, .get_size = bareudp_get_size, .fill_info = bareudp_fill_info, }; static __net_init int bareudp_init_net(struct net *net) { struct bareudp_net *bn = net_generic(net, bareudp_net_id); INIT_LIST_HEAD(&bn->bareudp_list); return 0; } static void bareudp_destroy_tunnels(struct net *net, struct list_head *head) { struct bareudp_net *bn = net_generic(net, bareudp_net_id); struct bareudp_dev *bareudp, *next; list_for_each_entry_safe(bareudp, next, &bn->bareudp_list, next) unregister_netdevice_queue(bareudp->dev, head); } static void __net_exit bareudp_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_kill_list) { struct net *net; list_for_each_entry(net, net_list, exit_list) bareudp_destroy_tunnels(net, dev_kill_list); } static struct pernet_operations bareudp_net_ops = { .init = bareudp_init_net, .exit_batch_rtnl = bareudp_exit_batch_rtnl, .id = &bareudp_net_id, .size = sizeof(struct bareudp_net), }; static int __init bareudp_init_module(void) { int rc; rc = register_pernet_subsys(&bareudp_net_ops); if (rc) goto out1; rc = rtnl_link_register(&bareudp_link_ops); if (rc) goto out2; return 0; out2: unregister_pernet_subsys(&bareudp_net_ops); out1: return rc; } late_initcall(bareudp_init_module); static void __exit bareudp_cleanup_module(void) { rtnl_link_unregister(&bareudp_link_ops); unregister_pernet_subsys(&bareudp_net_ops); } module_exit(bareudp_cleanup_module); MODULE_ALIAS_RTNL_LINK("bareudp"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Varghese <martin.varghese@nokia.com>"); MODULE_DESCRIPTION("Interface driver for UDP encapsulated traffic");
1462 1465 1 1 1 1 54 1 54 53 54 54 54 1 1 1 1 1 1 1 1 9 597 588 590 114 114 274 276 276 593 565 273 315 302 275 272 276 272 274 275 276 276 273 274 276 1414 1409 1410 276 273 275 276 274 275 276 275 276 275 276 273 276 274 276 275 275 275 276 275 275 276 276 276 276 274 275 275 275 273 276 275 274 275 276 275 275 276 52 52 52 52 51 53 53 53 8 8 54 52 54 53 47 54 53 52 52 53 54 54 54 54 54 47 54 54 6 54 54 54 54 54 54 54 51 54 1 54 54 54 54 54 54 1 3341 3348 3345 3344 3341 3343 4758 4754 4759 4759 4757 4765 4756 4757 4757 4758 4767 4754 4760 4757 4757 4767 4753 4760 4767 4760 4767 4749 4760 3399 8 132 132 132 132 132 4 1270 148 152 519 294 228 519 1 294 228 519 227 294 520 520 520 519 293 228 521 9 521 293 229 293 8 229 228 521 233 48 236 132 132 527 528 525 235 294 236 132 132 132 521 228 521 27 27 27 3099 9 941 942 3085 2204 2200 2199 945 149 945 942 945 942 939 941 3097 3093 6 4 4 4 9 9 9 9 9 9 9 9 9 4 4 9 9 4 3396 333 531 2922 3108 530 1 8 526 5 12 3406 3405 3258 156 3402 3397 3244 329 3399 3403 4 3402 3404 2204 1421 3402 3398 3413 3255 222 3404 3414 3253 222 222 3413 4201 8 9 9 9 4194 4201 3276 1063 9 228 229 7 8 221 15 275 275 275 428 428 425 10 10 10 10 10 19362 454 19351 592 598 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/memory.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ /* * demand-loading started 01.12.91 - seems it is high on the list of * things wanted, and it should be easy to implement. - Linus */ /* * Ok, demand-loading was easy, shared pages a little bit tricker. Shared * pages started 02.12.91, seems to work. - Linus. * * Tested sharing by executing about 30 /bin/sh: under the old kernel it * would have taken more than the 6M I have free, but it worked well as * far as I could see. * * Also corrected some "invalidate()"s - I wasn't doing enough of them. */ /* * Real VM (paging to/from disk) started 18.12.91. Much more work and * thought has to go into this. Oh, well.. * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. * Found it. Everything seems to work now. * 20.12.91 - Ok, making the swap-device changeable like the root. */ /* * 05.04.94 - Multi-page memory management added for v1.1. * Idea by Alex Bligh (alex@cconcepts.co.uk) * * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG * (Gerhard.Wichert@pdb.siemens.de) * * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) */ #include <linux/kernel_stat.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/memremap.h> #include <linux/kmsan.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/export.h> #include <linux/delayacct.h> #include <linux/init.h> #include <linux/pfn_t.h> #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> #include <linux/swapops.h> #include <linux/elf.h> #include <linux/gfp.h> #include <linux/migrate.h> #include <linux/string.h> #include <linux/memory-tiers.h> #include <linux/debugfs.h> #include <linux/userfaultfd_k.h> #include <linux/dax.h> #include <linux/oom.h> #include <linux/numa.h> #include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/vmalloc.h> #include <linux/sched/sysctl.h> #include <trace/events/kmem.h> #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include "pgalloc-track.h" #include "internal.h" #include "swap.h" #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST) #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif #ifndef CONFIG_NUMA unsigned long max_mapnr; EXPORT_SYMBOL(max_mapnr); struct page *mem_map; EXPORT_SYMBOL(mem_map); #endif static vm_fault_t do_fault(struct vm_fault *vmf); static vm_fault_t do_anonymous_page(struct vm_fault *vmf); static bool vmf_pte_changed(struct vm_fault *vmf); /* * Return true if the original pte was a uffd-wp pte marker (so the pte was * wr-protected). */ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) { if (!userfaultfd_wp(vmf->vma)) return false; if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return false; return pte_marker_uffd_wp(vmf->orig_pte); } /* * A number of key systems in x86 including ioremap() rely on the assumption * that high_memory defines the upper bound on direct map memory, then end * of ZONE_NORMAL. */ void *high_memory; EXPORT_SYMBOL(high_memory); /* * Randomize the address space (stacks, mmaps, brk, etc.). * * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, * as ancient (libc5 based) binaries can segfault. ) */ int randomize_va_space __read_mostly = #ifdef CONFIG_COMPAT_BRK 1; #else 2; #endif #ifndef arch_wants_old_prefaulted_pte static inline bool arch_wants_old_prefaulted_pte(void) { /* * Transitioning a PTE from 'old' to 'young' can be expensive on * some architectures, even if it's performed in hardware. By * default, "false" means prefaulted entries will be 'young'. */ return false; } #endif static int __init disable_randmaps(char *s) { randomize_va_space = 0; return 1; } __setup("norandmaps", disable_randmaps); unsigned long zero_pfn __read_mostly; EXPORT_SYMBOL(zero_pfn); unsigned long highest_memmap_pfn __read_mostly; /* * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() */ static int __init init_zero_pfn(void) { zero_pfn = page_to_pfn(ZERO_PAGE(0)); return 0; } early_initcall(init_zero_pfn); void mm_trace_rss_stat(struct mm_struct *mm, int member) { trace_rss_stat(mm, member); } /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); mm_dec_nr_ptes(tlb->mm); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pmd_t *pmd; unsigned long next; unsigned long start; start = addr; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; free_pte_range(tlb, pmd, addr); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; if (start < floor) return; if (ceiling) { ceiling &= PUD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pmd = pmd_offset(pud, start); pud_clear(pud); pmd_free_tlb(tlb, pmd, start); mm_dec_nr_pmds(tlb->mm); } static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pud_t *pud; unsigned long next; unsigned long start; start = addr; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; free_pmd_range(tlb, pud, addr, next, floor, ceiling); } while (pud++, addr = next, addr != end); start &= P4D_MASK; if (start < floor) return; if (ceiling) { ceiling &= P4D_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pud = pud_offset(p4d, start); p4d_clear(p4d); pud_free_tlb(tlb, pud, start); mm_dec_nr_puds(tlb->mm); } static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { p4d_t *p4d; unsigned long next; unsigned long start; start = addr; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; free_pud_range(tlb, p4d, addr, next, floor, ceiling); } while (p4d++, addr = next, addr != end); start &= PGDIR_MASK; if (start < floor) return; if (ceiling) { ceiling &= PGDIR_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; p4d = p4d_offset(pgd, start); pgd_clear(pgd); p4d_free_tlb(tlb, p4d, start); } /* * This function frees user-level page tables of a process. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pgd_t *pgd; unsigned long next; /* * The next few lines have given us lots of grief... * * Why are we testing PMD* at this top level? Because often * there will be no work to do at all, and we'd prefer not to * go all the way down to the bottom just to discover that. * * Why all these "- 1"s? Because 0 represents both the bottom * of the address space and the top of it (using -1 for the * top wouldn't help much: the masks would do the wrong thing). * The rule is that addr 0 and floor 0 refer to the bottom of * the address space, but end 0 and ceiling 0 refer to the top * Comparisons need to use "end - 1" and "ceiling - 1" (though * that end 0 case should be mythical). * * Wherever addr is brought up or ceiling brought down, we must * be careful to reject "the opposite 0" before it confuses the * subsequent tests. But what about where end is brought down * by PMD_SIZE below? no, end can't go down to 0 there. * * Whereas we round start (addr) and ceiling down, by different * masks at different levels, in order to test whether a table * now has no other vmas using it, so can be freed, we don't * bother to round floor or end up - the tests don't need that. */ addr &= PMD_MASK; if (addr < floor) { addr += PMD_SIZE; if (!addr) return; } if (ceiling) { ceiling &= PMD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) end -= PMD_SIZE; if (addr > end - 1) return; /* * We add page table cache pages with PAGE_SIZE, * (see pte_free_tlb()), flush the tlb if we need */ tlb_change_page_size(tlb, PAGE_SIZE); pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; free_p4d_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling, bool mm_wr_locked) { struct unlink_vma_file_batch vb; do { unsigned long addr = vma->vm_start; struct vm_area_struct *next; /* * Note: USER_PGTABLES_CEILING may be passed as ceiling and may * be 0. This will underflow and is okay. */ next = mas_find(mas, ceiling - 1); if (unlikely(xa_is_zero(next))) next = NULL; /* * Hide vma from rmap and truncate_pagecache before freeing * pgtables */ if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); if (is_vm_hugetlb_page(vma)) { unlink_file_vma(vma); hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next ? next->vm_start : ceiling); } else { unlink_file_vma_batch_init(&vb); unlink_file_vma_batch_add(&vb, vma); /* * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_vm_hugetlb_page(next)) { vma = next; next = mas_find(mas, ceiling - 1); if (unlikely(xa_is_zero(next))) next = NULL; if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); unlink_file_vma_batch_add(&vb, vma); } unlink_file_vma_batch_final(&vb); free_pgd_range(tlb, addr, vma->vm_end, floor, next ? next->vm_start : ceiling); } vma = next; } while (vma); } void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte) { spinlock_t *ptl = pmd_lock(mm, pmd); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ mm_inc_nr_ptes(mm); /* * Ensure all pte setup (eg. pte page lock and page clearing) are * visible before the pte is made visible to other CPUs by being * put into page tables. * * The other side of the story is the pointer chasing in the page * table walking code (when walking the page table without locking; * ie. most of the time). Fortunately, these data accesses consist * of a chain of data-dependent loads, meaning most CPUs (alpha * being the notable exception) will already guarantee loads are * seen in-order. See the alpha page table accessors for the * smp_rmb() barriers in page table walking code. */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ pmd_populate(mm, pmd, *pte); *pte = NULL; } spin_unlock(ptl); } int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) { pgtable_t new = pte_alloc_one(mm); if (!new) return -ENOMEM; pmd_install(mm, pmd, &new); if (new) pte_free(mm, new); return 0; } int __pte_alloc_kernel(pmd_t *pmd) { pte_t *new = pte_alloc_one_kernel(&init_mm); if (!new) return -ENOMEM; spin_lock(&init_mm.page_table_lock); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ smp_wmb(); /* See comment in pmd_install() */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; } spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); return 0; } static inline void init_rss_vec(int *rss) { memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); } static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) { int i; for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); } /* * This function is called to print an error when a bad pte * is found. For example, we might have a PFN-mapped pte in * a region that doesn't allow it. * * The calling function must still handle the error. */ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, pte_t pte, struct page *page) { pgd_t *pgd = pgd_offset(vma->vm_mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); struct address_space *mapping; pgoff_t index; static unsigned long resume; static unsigned long nr_shown; static unsigned long nr_unshown; /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. */ if (nr_shown == 60) { if (time_before(jiffies, resume)) { nr_unshown++; return; } if (nr_unshown) { pr_alert("BUG: Bad page map: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; } nr_shown = 0; } if (nr_shown++ == 0) resume = jiffies + 60 * HZ; mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", current->comm, (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) dump_page(page, "bad pte"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, mapping ? mapping->a_ops->read_folio : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } /* * vm_normal_page -- This function gets the "struct page" associated with a pte. * * "Special" mappings do not wish to be associated with a "struct page" (either * it doesn't exist, or it exists but they don't want to touch it). In this * case, NULL is returned here. "Normal" mappings do have a struct page. * * There are 2 broad cases. Firstly, an architecture may define a pte_special() * pte bit, in which case this function is trivial. Secondly, an architecture * may not have a spare pte bit, which requires a more complicated scheme, * described below. * * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a * special mapping (even if there are underlying and valid "struct pages"). * COWed pages of a VM_PFNMAP are always normal. * * The way we recognize COWed pages within VM_PFNMAP mappings is through the * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit * set, and the vm_pgoff will point to the first PFN mapped: thus every special * mapping will always honor the rule * * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) * * And for normal mappings this is false. * * This restricts such mappings to be a linear translation from virtual address * to pfn. To get around this restriction, we allow arbitrary mappings so long * as the vma is not a COW mapping; in that case, we know that all ptes are * special (because none can have been COWed). * * * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. * * VM_MIXEDMAP mappings can likewise contain memory with or without "struct * page" backing, however the difference is that _all_ pages with a struct * page (that is, those where pfn_valid is true) are refcounted and considered * normal pages by the VM. The only exception are zeropages, which are * *never* refcounted. * * The disadvantage is that pages are refcounted (which can be slower and * simply not an option for some PFNMAP users). The advantage is that we * don't have to follow the strict linearity rule of PFNMAP mappings in * order to support COWable mappings. * */ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { unsigned long pfn = pte_pfn(pte); if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { if (likely(!pte_special(pte))) goto check_pfn; if (vma->vm_ops && vma->vm_ops->find_special_page) return vma->vm_ops->find_special_page(vma, addr); if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; if (is_zero_pfn(pfn)) return NULL; if (pte_devmap(pte)) /* * NOTE: New users of ZONE_DEVICE will not set pte_devmap() * and will have refcounts incremented on their struct pages * when they are inserted into PTEs, thus they are safe to * return here. Legacy ZONE_DEVICE pages that set pte_devmap() * do not have refcounts. Example of legacy ZONE_DEVICE is * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. */ return NULL; print_bad_pte(vma, addr, pte, NULL); return NULL; } /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) return NULL; if (is_zero_pfn(pfn)) return NULL; goto out; } else { unsigned long off; off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; if (!is_cow_mapping(vma->vm_flags)) return NULL; } } if (is_zero_pfn(pfn)) return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); return NULL; } /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. */ out: VM_WARN_ON_ONCE(is_zero_pfn(pfn)); return pfn_to_page(pfn); } struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { struct page *page = vm_normal_page(vma, addr, pte); if (page) return page_folio(page); return NULL; } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { unsigned long pfn = pmd_pfn(pmd); /* Currently it's only used for huge pfnmaps */ if (unlikely(pmd_special(pmd))) return NULL; if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) return NULL; goto out; } else { unsigned long off; off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; if (!is_cow_mapping(vma->vm_flags)) return NULL; } } if (pmd_devmap(pmd)) return NULL; if (is_huge_zero_pmd(pmd)) return NULL; if (unlikely(pfn > highest_memmap_pfn)) return NULL; /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. */ out: return pfn_to_page(pfn); } struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { struct page *page = vm_normal_page_pmd(vma, addr, pmd); if (page) return page_folio(page); return NULL; } #endif static void restore_exclusive_pte(struct vm_area_struct *vma, struct page *page, unsigned long address, pte_t *ptep) { struct folio *folio = page_folio(page); pte_t orig_pte; pte_t pte; swp_entry_t entry; orig_pte = ptep_get(ptep); pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); entry = pte_to_swp_entry(orig_pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_mkuffd_wp(pte); else if (is_writable_device_exclusive_entry(entry)) pte = maybe_mkwrite(pte_mkdirty(pte), vma); VM_BUG_ON_FOLIO(pte_write(pte) && (!folio_test_anon(folio) && PageAnonExclusive(page)), folio); /* * No need to take a page reference as one was already * created when the swap entry was made. */ if (folio_test_anon(folio)) folio_add_anon_rmap_pte(folio, page, vma, address, RMAP_NONE); else /* * Currently device exclusive access only supports anonymous * memory so the entry shouldn't point to a filebacked page. */ WARN_ON_ONCE(1); set_pte_at(vma->vm_mm, address, ptep, pte); /* * No need to invalidate - it was non-present before. However * secondary CPUs may have mappings that need invalidating. */ update_mmu_cache(vma, address, ptep); } /* * Tries to restore an exclusive pte if the page lock can be acquired without * sleeping. */ static int try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma, unsigned long addr) { swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte)); struct page *page = pfn_swap_entry_to_page(entry); if (trylock_page(page)) { restore_exclusive_pte(vma, page, addr, src_pte); unlock_page(page); return 0; } return -EBUSY; } /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. */ static unsigned long copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { unsigned long vm_flags = dst_vma->vm_flags; pte_t orig_pte = ptep_get(src_pte); pte_t pte = orig_pte; struct folio *folio; struct page *page; swp_entry_t entry = pte_to_swp_entry(orig_pte); if (likely(!non_swap_entry(entry))) { if (swap_duplicate(entry) < 0) return -EIO; /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); if (list_empty(&dst_mm->mmlist)) list_add(&dst_mm->mmlist, &src_mm->mmlist); spin_unlock(&mmlist_lock); } /* Mark the swap entry as shared. */ if (pte_swp_exclusive(orig_pte)) { pte = pte_swp_clear_exclusive(orig_pte); set_pte_at(src_mm, addr, src_pte, pte); } rss[MM_SWAPENTS]++; } else if (is_migration_entry(entry)) { folio = pfn_swap_entry_folio(entry); rss[mm_counter(folio)]++; if (!is_readable_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent and child * to be set to read. A previously exclusive entry is * now shared. */ entry = make_readable_migration_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(orig_pte)) pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } else if (is_device_private_entry(entry)) { page = pfn_swap_entry_to_page(entry); folio = page_folio(page); /* * Update rss count even for unaddressable pages, as * they should treated just like normal pages in this * respect. * * We will likely want to have some new rss counters * for unaddressable pages, at some point. But for now * keep things as they are. */ folio_get(folio); rss[mm_counter(folio)]++; /* Cannot fail as these pages cannot get pinned. */ folio_try_dup_anon_rmap_pte(folio, page, src_vma); /* * We do not preserve soft-dirty information, because so * far, checkpoint/restore is the only feature that * requires that. And checkpoint/restore does not work * when a device driver is involved (you cannot easily * save and restore device driver state). */ if (is_writable_device_private_entry(entry) && is_cow_mapping(vm_flags)) { entry = make_readable_device_private_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } else if (is_device_exclusive_entry(entry)) { /* * Make device exclusive entries present by restoring the * original entry then copying as for a present pte. Device * exclusive entries currently only support private writable * (ie. COW) mappings. */ VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags)); if (try_restore_exclusive_pte(src_pte, src_vma, addr)) return -EBUSY; return -ENOENT; } else if (is_pte_marker_entry(entry)) { pte_marker marker = copy_pte_marker(entry, dst_vma); if (marker) set_pte_at(dst_mm, addr, dst_pte, make_pte_marker(marker)); return 0; } if (!userfaultfd_wp(dst_vma)) pte = pte_swp_clear_uffd_wp(pte); set_pte_at(dst_mm, addr, dst_pte, pte); return 0; } /* * Copy a present and normal page. * * NOTE! The usual case is that this isn't required; * instead, the caller can just increase the page refcount * and re-use the pte the traditional way. * * And if we need a pre-allocated page but don't yet have * one, return a negative error to let the preallocation * code know so that it can do so outside the page table * lock. */ static inline int copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, struct folio **prealloc, struct page *page) { struct folio *new_folio; pte_t pte; new_folio = *prealloc; if (!new_folio) return -EAGAIN; /* * We have a prealloc page, all good! Take it * over and copy the page & arm it. */ if (copy_mc_user_highpage(&new_folio->page, page, addr, src_vma)) return -EHWPOISON; *prealloc = NULL; __folio_mark_uptodate(new_folio); folio_add_new_anon_rmap(new_folio, dst_vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, dst_vma); rss[MM_ANONPAGES]++; /* All done, just insert the new page copy in the child */ pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ pte = pte_mkuffd_wp(pte); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, int nr) { struct mm_struct *src_mm = src_vma->vm_mm; /* If it's a COW mapping, write protect it both processes. */ if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) { wrprotect_ptes(src_mm, addr, src_pte, nr); pte = pte_wrprotect(pte); } /* If it's a shared mapping, mark it clean in the child. */ if (src_vma->vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); if (!userfaultfd_wp(dst_vma)) pte = pte_clear_uffd_wp(pte); set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); } /* * Copy one present PTE, trying to batch-process subsequent PTEs that map * consecutive pages of the same folio by copying them as well. * * Returns -EAGAIN if one preallocated page is required to copy the next PTE. * Otherwise, returns the number of copied PTEs (at least 1). */ static inline int copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, int max_nr, int *rss, struct folio **prealloc) { struct page *page; struct folio *folio; bool any_writable; fpb_t flags = 0; int err, nr; page = vm_normal_page(src_vma, addr, pte); if (unlikely(!page)) goto copy_pte; folio = page_folio(page); /* * If we likely have to copy, just don't bother with batching. Make * sure that the common "small folio" case is as fast as possible * by keeping the batching logic separate. */ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { if (src_vma->vm_flags & VM_SHARED) flags |= FPB_IGNORE_DIRTY; if (!vma_soft_dirty_enabled(src_vma)) flags |= FPB_IGNORE_SOFT_DIRTY; nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, &any_writable, NULL, NULL); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, nr, src_vma))) { folio_ref_sub(folio, nr); return -EAGAIN; } rss[MM_ANONPAGES] += nr; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); } else { folio_dup_file_rmap_ptes(folio, page, nr); rss[mm_counter_file(folio)] += nr; } if (any_writable) pte = pte_mkwrite(pte, src_vma); __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, nr); return nr; } folio_get(folio); if (folio_test_anon(folio)) { /* * If this page may have been pinned by the parent process, * copy the page immediately for the child so that we'll always * guarantee the pinned page won't be randomly replaced in the * future. */ if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) { /* Page may be pinned, we have to copy. */ folio_put(folio); err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, addr, rss, prealloc, page); return err ? err : 1; } rss[MM_ANONPAGES]++; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); } else { folio_dup_file_rmap_pte(folio, page); rss[mm_counter_file(folio)]++; } copy_pte: __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1); return 1; } static inline struct folio *folio_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, unsigned long addr, bool need_zero) { struct folio *new_folio; if (need_zero) new_folio = vma_alloc_zeroed_movable_folio(vma, addr); else new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (!new_folio) return NULL; if (mem_cgroup_charge(new_folio, src_mm, GFP_KERNEL)) { folio_put(new_folio); return NULL; } folio_throttle_swaprate(new_folio, GFP_KERNEL); return new_folio; } static int copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; pmd_t dummy_pmdval; pte_t ptent; spinlock_t *src_ptl, *dst_ptl; int progress, max_nr, ret = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; struct folio *prealloc = NULL; int nr; again: progress = 0; init_rss_vec(rss); /* * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the * error handling here, assume that exclusive mmap_lock on dst and src * protects anon from unexpected THP transitions; with shmem and file * protected by mmap_lock-less collapse skipping areas with anon_vma * (whereas vma_needs_copy() skips areas without anon_vma). A rework * can remove such assumptions later, but this is good enough for now. */ dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) { ret = -ENOMEM; goto out; } /* * We already hold the exclusive mmap_lock, the copy_pte_range() and * retract_page_tables() are using vma->anon_vma to be exclusive, so * the PTE page is stable, and there is no need to get pmdval and do * pmd_same() check. */ src_pte = pte_offset_map_rw_nolock(src_mm, src_pmd, addr, &dummy_pmdval, &src_ptl); if (!src_pte) { pte_unmap_unlock(dst_pte, dst_ptl); /* ret == 0 */ goto out; } spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; orig_dst_pte = dst_pte; arch_enter_lazy_mmu_mode(); do { nr = 1; /* * We are holding two locks at this point - either of them * could generate latencies in another task on another CPU. */ if (progress >= 32) { progress = 0; if (need_resched() || spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } ptent = ptep_get(src_pte); if (pte_none(ptent)) { progress++; continue; } if (unlikely(!pte_present(ptent))) { ret = copy_nonpresent_pte(dst_mm, src_mm, dst_pte, src_pte, dst_vma, src_vma, addr, rss); if (ret == -EIO) { entry = pte_to_swp_entry(ptep_get(src_pte)); break; } else if (ret == -EBUSY) { break; } else if (!ret) { progress += 8; continue; } ptent = ptep_get(src_pte); VM_WARN_ON_ONCE(!pte_present(ptent)); /* * Device exclusive entry restored, continue by copying * the now present pte. */ WARN_ON_ONCE(ret != -ENOENT); } /* copy_present_ptes() will clear `*prealloc' if consumed */ max_nr = (end - addr) / PAGE_SIZE; ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, ptent, addr, max_nr, rss, &prealloc); /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. * If copy failed due to hwpoison in source page, break out. */ if (unlikely(ret == -EAGAIN || ret == -EHWPOISON)) break; if (unlikely(prealloc)) { /* * pre-alloc page cannot be reused by next time so as * to strictly follow mempolicy (e.g., alloc_page_vma() * will allocate page according to address). This * could only happen if one pinned pte changed. */ folio_put(prealloc); prealloc = NULL; } nr = ret; progress += 8 * nr; } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(orig_src_pte, src_ptl); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); if (ret == -EIO) { VM_WARN_ON_ONCE(!entry.val); if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) { ret = -ENOMEM; goto out; } entry.val = 0; } else if (ret == -EBUSY || unlikely(ret == -EHWPOISON)) { goto out; } else if (ret == -EAGAIN) { prealloc = folio_prealloc(src_mm, src_vma, addr, false); if (!prealloc) return -ENOMEM; } else if (ret < 0) { VM_WARN_ON_ONCE(1); } /* We've captured and resolved the error. Reset, try again. */ ret = 0; if (addr != end) goto again; out: if (unlikely(prealloc)) folio_put(prealloc); return ret; } static inline int copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pmd_t *src_pmd, *dst_pmd; unsigned long next; dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); if (!dst_pmd) return -ENOMEM; src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, dst_vma, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) continue; /* fall through */ } if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; } static inline int copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pud_t *src_pud, *dst_pud; unsigned long next; dst_pud = pud_alloc(dst_mm, dst_p4d, addr); if (!dst_pud) return -ENOMEM; src_pud = pud_offset(src_p4d, addr); do { next = pud_addr_end(addr, end); if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); err = copy_huge_pud(dst_mm, src_mm, dst_pud, src_pud, addr, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) continue; /* fall through */ } if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; } static inline int copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; p4d_t *src_p4d, *dst_p4d; unsigned long next; dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr); if (!dst_p4d) return -ENOMEM; src_p4d = p4d_offset(src_pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(src_p4d)) continue; if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d, addr, next)) return -ENOMEM; } while (dst_p4d++, src_p4d++, addr = next, addr != end); return 0; } /* * Return true if the vma needs to copy the pgtable during this fork(). Return * false when we can speed up fork() by allowing lazy page faults later until * when the child accesses the memory range. */ static bool vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { /* * Always copy pgtables when dst_vma has uffd-wp enabled even if it's * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable * contains uffd-wp protection information, that's something we can't * retrieve from page cache, and skip copying will lose those info. */ if (userfaultfd_wp(dst_vma)) return true; if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return true; if (src_vma->anon_vma) return true; /* * Don't copy ptes where a page fault will fill them correctly. Fork * becomes much lighter when there are big shared or private readonly * mappings. The tradeoff is that copy_page_range is more efficient * than faulting. */ return false; } int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pgd_t *src_pgd, *dst_pgd; unsigned long next; unsigned long addr = src_vma->vm_start; unsigned long end = src_vma->vm_end; struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; bool is_cow; int ret; if (!vma_needs_copy(dst_vma, src_vma)) return 0; if (is_vm_hugetlb_page(src_vma)) return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { /* * We do not free on error cases below as remove_vma * gets called on error from higher level routine */ ret = track_pfn_copy(src_vma); if (ret) return ret; } /* * We need to invalidate the secondary MMU mappings only when * there could be a permission downgrade on the ptes of the * parent mm. And a permission downgrade will only happen if * is_cow_mapping() returns true. */ is_cow = is_cow_mapping(src_vma->vm_flags); if (is_cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); /* * Disabling preemption is not needed for the write side, as * the read side doesn't spin, but goes to the mmap_lock. * * Use the raw variant of the seqcount_t write API to avoid * lockdep complaining about preemptibility. */ vma_assert_write_locked(src_vma); raw_write_seqcount_begin(&src_mm->write_protect_seq); } ret = 0; dst_pgd = pgd_offset(dst_mm, addr); src_pgd = pgd_offset(src_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, addr, next))) { untrack_pfn_clear(dst_vma); ret = -ENOMEM; break; } } while (dst_pgd++, src_pgd++, addr = next, addr != end); if (is_cow) { raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } return ret; } /* Whether we should zap all COWed (private) pages too */ static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ if (!details) return true; /* Or, we zap COWed pages only if the caller wants to */ return details->even_cows; } /* Decides whether we should zap this folio with the folio pointer specified */ static inline bool should_zap_folio(struct zap_details *details, struct folio *folio) { /* If we can make a decision without *folio.. */ if (should_zap_cows(details)) return true; /* Otherwise we should only zap non-anon folios */ return !folio_test_anon(folio); } static inline bool zap_drop_markers(struct zap_details *details) { if (!details) return false; return details->zap_flags & ZAP_FLAG_DROP_MARKER; } /* * This function makes sure that we'll replace the none pte with an uffd-wp * swap special pte marker when necessary. Must be with the pgtable lock held. */ static inline void zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, int nr, struct zap_details *details, pte_t pteval) { /* Zap on anonymous always means dropping everything */ if (vma_is_anonymous(vma)) return; if (zap_drop_markers(details)) return; for (;;) { /* the PFN in the PTE is irrelevant. */ pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); if (--nr == 0) break; pte++; addr += PAGE_SIZE; } } static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio, struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, bool *force_break) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; if (!folio_test_anon(folio)) { ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); if (pte_dirty(ptent)) { folio_mark_dirty(folio); if (tlb_delay_rmap(tlb)) { delay_rmap = true; *force_flush = true; } } if (pte_young(ptent) && likely(vma_has_recency(vma))) folio_mark_accessed(folio); rss[mm_counter(folio)] -= nr; } else { /* We don't need up-to-date accessed/dirty bits. */ clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); rss[MM_ANONPAGES] -= nr; } /* Checking a single PTE in a batch is sufficient. */ arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entries(tlb, pte, nr, addr); if (unlikely(userfaultfd_pte_wp(vma, ptent))) zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); if (!delay_rmap) { folio_remove_rmap_ptes(folio, page, nr, vma); if (unlikely(folio_mapcount(folio) < 0)) print_bad_pte(vma, addr, ptent, page); } if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) { *force_flush = true; *force_break = true; } } /* * Zap or skip at least one present PTE, trying to batch-process subsequent * PTEs that map consecutive pages of the same folio. * * Returns the number of processed (skipped or zapped) PTEs (at least 1). */ static inline int zap_present_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, bool *force_break) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; struct folio *folio; struct page *page; int nr; page = vm_normal_page(vma, addr, ptent); if (!page) { /* We don't need up-to-date accessed/dirty bits. */ ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); if (userfaultfd_pte_wp(vma, ptent)) zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent); ksm_might_unmap_zero_page(mm, ptent); return 1; } folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) return 1; /* * Make sure that the common "small folio" case is as fast as possible * by keeping the batching logic separate. */ if (unlikely(folio_test_large(folio) && max_nr != 1)) { nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, NULL, NULL, NULL); zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, force_break); return nr; } zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, details, rss, force_flush, force_break); return 1; } static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, struct zap_details *details) { bool force_flush = false, force_break = false; struct mm_struct *mm = tlb->mm; int rss[NR_MM_COUNTERS]; spinlock_t *ptl; pte_t *start_pte; pte_t *pte; swp_entry_t entry; int nr; tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte) return addr; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { pte_t ptent = ptep_get(pte); struct folio *folio; struct page *page; int max_nr; nr = 1; if (pte_none(ptent)) continue; if (need_resched()) break; if (pte_present(ptent)) { max_nr = (end - addr) / PAGE_SIZE; nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr, details, rss, &force_flush, &force_break); if (unlikely(force_break)) { addr += nr * PAGE_SIZE; break; } continue; } entry = pte_to_swp_entry(ptent); if (is_device_private_entry(entry) || is_device_exclusive_entry(entry)) { page = pfn_swap_entry_to_page(entry); folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) continue; /* * Both device private/exclusive mappings should only * work with anonymous page so far, so we don't need to * consider uffd-wp bit when zap. For more information, * see zap_install_uffd_wp_if_needed(). */ WARN_ON_ONCE(!vma_is_anonymous(vma)); rss[mm_counter(folio)]--; if (is_device_private_entry(entry)) folio_remove_rmap_pte(folio, page, vma); folio_put(folio); } else if (!non_swap_entry(entry)) { max_nr = (end - addr) / PAGE_SIZE; nr = swap_pte_batch(pte, max_nr, ptent); /* Genuine swap entries, hence a private anon pages */ if (!should_zap_cows(details)) continue; rss[MM_SWAPENTS] -= nr; free_swap_and_cache_nr(entry, nr); } else if (is_migration_entry(entry)) { folio = pfn_swap_entry_folio(entry); if (!should_zap_folio(details, folio)) continue; rss[mm_counter(folio)]--; } else if (pte_marker_entry_uffd_wp(entry)) { /* * For anon: always drop the marker; for file: only * drop the marker if explicitly requested. */ if (!vma_is_anonymous(vma) && !zap_drop_markers(details)) continue; } else if (is_guard_swp_entry(entry)) { /* * Ordinary zapping should not remove guard PTE * markers. Only do so if we should remove PTE markers * in general. */ if (!zap_drop_markers(details)) continue; } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { if (!should_zap_cows(details)) continue; } else { /* We should have covered all the swap entry types */ pr_alert("unrecognized swap entry 0x%lx\n", entry.val); WARN_ON_ONCE(1); } clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); /* Do the actual TLB flush before dropping ptl */ if (force_flush) { tlb_flush_mmu_tlbonly(tlb); tlb_flush_rmaps(tlb, vma); } pte_unmap_unlock(start_pte, ptl); /* * If we forced a TLB flush (either due to running out of * batch buffers or because we needed to flush dirty TLB * entries before releasing the ptl), free the batched * memory too. Come back again if we didn't do everything. */ if (force_flush) tlb_flush_mmu(tlb); return addr; } static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, struct zap_details *details) { pmd_t *pmd; unsigned long next; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false, NULL); else if (zap_huge_pmd(tlb, vma, pmd, addr)) { addr = next; continue; } /* fall through */ } else if (details && details->single_folio && folio_test_pmd_mappable(details->single_folio) && next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { spinlock_t *ptl = pmd_lock(tlb->mm, pmd); /* * Take and drop THP pmd lock so that we cannot return * prematurely, while zap_huge_pmd() has cleared *pmd, * but not yet decremented compound_mapcount(). */ spin_unlock(ptl); } if (pmd_none(*pmd)) { addr = next; continue; } addr = zap_pte_range(tlb, vma, pmd, addr, next, details); if (addr != next) pmd--; } while (pmd++, cond_resched(), addr != end); return addr; } static inline unsigned long zap_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, struct zap_details *details) { pud_t *pud; unsigned long next; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_trans_huge(*pud) || pud_devmap(*pud)) { if (next - addr != HPAGE_PUD_SIZE) { mmap_assert_locked(tlb->mm); split_huge_pud(vma, pud, addr); } else if (zap_huge_pud(tlb, vma, pud, addr)) goto next; /* fall through */ } if (pud_none_or_clear_bad(pud)) continue; next = zap_pmd_range(tlb, vma, pud, addr, next, details); next: cond_resched(); } while (pud++, addr = next, addr != end); return addr; } static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, struct zap_details *details) { p4d_t *p4d; unsigned long next; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; next = zap_pud_range(tlb, vma, p4d, addr, next, details); } while (p4d++, addr = next, addr != end); return addr; } void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details) { pgd_t *pgd; unsigned long next; BUG_ON(addr >= end); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; next = zap_p4d_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); } static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details, bool mm_wr_locked) { unsigned long start = max(vma->vm_start, start_addr); unsigned long end; if (start >= vma->vm_end) return; end = min(vma->vm_end, end_addr); if (end <= vma->vm_start) return; if (vma->vm_file) uprobe_munmap(vma, start, end); if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn(vma, 0, 0, mm_wr_locked); if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { /* * It is undesirable to test vma->vm_file as it * should be non-null for valid hugetlb area. * However, vm_file will be NULL in the error * cleanup path of mmap_region. When * hugetlbfs ->mmap method fails, * mmap_region() nullifies vma->vm_file * before calling this function to clean up. * Since no pte has actually been setup, it is * safe to do nothing in this case. */ if (vma->vm_file) { zap_flags_t zap_flags = details ? details->zap_flags : 0; __unmap_hugepage_range(tlb, vma, start, end, NULL, zap_flags); } } else unmap_page_range(tlb, vma, start, end, details); } } /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather * @mas: the maple state * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping * @tree_end: The maximum index to check * @mm_wr_locked: lock flag * * Unmap all pages in the vma list. * * Only addresses between `start' and `end' will be unmapped. * * The VMA list must be sorted in ascending virtual address order. * * unmap_vmas() assumes that the caller will flush the whole unmapped address * range after unmap_vmas() returns. So the only responsibility here is to * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long tree_end, bool mm_wr_locked) { struct mmu_notifier_range range; struct zap_details details = { .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP, /* Careful - we need to zap private pages too! */ .even_cows = true, }; mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); do { unsigned long start = start_addr; unsigned long end = end_addr; hugetlb_zap_begin(vma, &start, &end); unmap_single_vma(tlb, vma, start, end, &details, mm_wr_locked); hugetlb_zap_end(vma, &details); vma = mas_find(mas, tree_end - 1); } while (vma && likely(!xa_is_zero(vma))); mmu_notifier_invalidate_range_end(&range); } /** * zap_page_range_single - remove user pages in a given range * @vma: vm_area_struct holding the applicable pages * @address: starting address of pages to zap * @size: number of bytes to zap * @details: details of shared cache invalidation * * The range must fit into one VMA. */ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { const unsigned long end = address + size; struct mmu_notifier_range range; struct mmu_gather tlb; lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); /* * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ unmap_single_vma(&tlb, vma, address, end, details, false); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); hugetlb_zap_end(vma, details); } /** * zap_vma_ptes - remove ptes mapping the vma * @vma: vm_area_struct holding ptes to be zapped * @address: starting address of pages to zap * @size: number of bytes to zap * * This function only unmaps ptes assigned to VM_PFNMAP vmas. * * The entire address range must be fully contained within the vma. * */ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size) { if (!range_in_vma(vma, address, address + size) || !(vma->vm_flags & VM_PFNMAP)) return; zap_page_range_single(vma, address, size, NULL); } EXPORT_SYMBOL_GPL(zap_vma_ptes); static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return NULL; pud = pud_alloc(mm, p4d, addr); if (!pud) return NULL; pmd = pmd_alloc(mm, pud, addr); if (!pmd) return NULL; VM_BUG_ON(pmd_trans_huge(*pmd)); return pmd; } pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pmd_t *pmd = walk_to_pmd(mm, addr); if (!pmd) return NULL; return pte_alloc_map_lock(mm, pmd, addr, ptl); } static bool vm_mixed_zeropage_allowed(struct vm_area_struct *vma) { VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP); /* * Whoever wants to forbid the zeropage after some zeropages * might already have been mapped has to scan the page tables and * bail out on any zeropages. Zeropages in COW mappings can * be unshared using FAULT_FLAG_UNSHARE faults. */ if (mm_forbids_zeropage(vma->vm_mm)) return false; /* zeropages in COW mappings are common and unproblematic. */ if (is_cow_mapping(vma->vm_flags)) return true; /* Mappings that do not allow for writable PTEs are unproblematic. */ if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) return true; /* * Why not allow any VMA that has vm_ops->pfn_mkwrite? GUP could * find the shared zeropage and longterm-pin it, which would * be problematic as soon as the zeropage gets replaced by a different * page due to vma->vm_ops->pfn_mkwrite, because what's mapped would * now differ to what GUP looked up. FSDAX is incompatible to * FOLL_LONGTERM and VM_IO is incompatible to GUP completely (see * check_vma_flags). */ return vma->vm_ops && vma->vm_ops->pfn_mkwrite && (vma_is_fsdax(vma) || vma->vm_flags & VM_IO); } static int validate_page_before_insert(struct vm_area_struct *vma, struct page *page) { struct folio *folio = page_folio(page); if (!folio_ref_count(folio)) return -EINVAL; if (unlikely(is_zero_folio(folio))) { if (!vm_mixed_zeropage_allowed(vma)) return -EINVAL; return 0; } if (folio_test_anon(folio) || folio_test_slab(folio) || page_has_type(page)) return -EINVAL; flush_dcache_folio(folio); return 0; } static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { struct folio *folio = page_folio(page); pte_t pteval; if (!pte_none(ptep_get(pte))) return -EBUSY; /* Ok, finally just insert the thing.. */ pteval = mk_pte(page, prot); if (unlikely(is_zero_folio(folio))) { pteval = pte_mkspecial(pteval); } else { folio_get(folio); inc_mm_counter(vma->vm_mm, mm_counter_file(folio)); folio_add_file_rmap_pte(folio, page, vma); } set_pte_at(vma->vm_mm, addr, pte, pteval); return 0; } static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot) { int retval; pte_t *pte; spinlock_t *ptl; retval = validate_page_before_insert(vma, page); if (retval) goto out; retval = -ENOMEM; pte = get_locked_pte(vma->vm_mm, addr, &ptl); if (!pte) goto out; retval = insert_page_into_pte_locked(vma, pte, addr, page, prot); pte_unmap_unlock(pte, ptl); out: return retval; } static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { int err; err = validate_page_before_insert(vma, page); if (err) return err; return insert_page_into_pte_locked(vma, pte, addr, page, prot); } /* insert_pages() amortizes the cost of spinlock operations * when inserting pages in a loop. */ static int insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num, pgprot_t prot) { pmd_t *pmd = NULL; pte_t *start_pte, *pte; spinlock_t *pte_lock; struct mm_struct *const mm = vma->vm_mm; unsigned long curr_page_idx = 0; unsigned long remaining_pages_total = *num; unsigned long pages_to_write_in_pmd; int ret; more: ret = -EFAULT; pmd = walk_to_pmd(mm, addr); if (!pmd) goto out; pages_to_write_in_pmd = min_t(unsigned long, remaining_pages_total, PTRS_PER_PTE - pte_index(addr)); /* Allocate the PTE if necessary; takes PMD lock once only. */ ret = -ENOMEM; if (pte_alloc(mm, pmd)) goto out; while (pages_to_write_in_pmd) { int pte_idx = 0; const int batch_size = min_t(int, pages_to_write_in_pmd, 8); start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); if (!start_pte) { ret = -EFAULT; goto out; } for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { int err = insert_page_in_batch_locked(vma, pte, addr, pages[curr_page_idx], prot); if (unlikely(err)) { pte_unmap_unlock(start_pte, pte_lock); ret = err; remaining_pages_total -= pte_idx; goto out; } addr += PAGE_SIZE; ++curr_page_idx; } pte_unmap_unlock(start_pte, pte_lock); pages_to_write_in_pmd -= batch_size; remaining_pages_total -= batch_size; } if (remaining_pages_total) goto more; ret = 0; out: *num = remaining_pages_total; return ret; } /** * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock. * @vma: user vma to map to * @addr: target start user address of these pages * @pages: source kernel pages * @num: in: number of pages to map. out: number of pages that were *not* * mapped. (0 means all pages were successfully mapped). * * Preferred over vm_insert_page() when inserting multiple pages. * * In case of error, we may have mapped a subset of the provided * pages. It is the caller's responsibility to account for this case. * * The same restrictions apply as in vm_insert_page(). */ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num) { const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1; if (addr < vma->vm_start || end_addr >= vma->vm_end) return -EFAULT; if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); vm_flags_set(vma, VM_MIXEDMAP); } /* Defer page refcount checking till we're about to map that page. */ return insert_pages(vma, addr, pages, num, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_pages); /** * vm_insert_page - insert single page into user vma * @vma: user vma to map to * @addr: target user address of this page * @page: source kernel page * * This allows drivers to insert individual pages they've allocated * into a user vma. The zeropage is supported in some VMAs, * see vm_mixed_zeropage_allowed(). * * The page has to be a nice clean _individual_ kernel allocation. * If you allocate a compound page, you need to have marked it as * such (__GFP_COMP), or manually just split the page up yourself * (see split_page()). * * NOTE! Traditionally this was done with "remap_pfn_range()" which * took an arbitrary page protection parameter. This doesn't allow * that. Your vma protection will have to be set up correctly, which * means that if you want a shared writable mapping, you'd better * ask for a shared writable mapping! * * The page does not need to be reserved. * * Usually this function is called from f_op->mmap() handler * under mm->mmap_lock write-lock, so it can change vma->vm_flags. * Caller must set VM_MIXEDMAP on vma if it wants to call this * function from other places, for example from page-fault handler. * * Return: %0 on success, negative error code otherwise. */ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); vm_flags_set(vma, VM_MIXEDMAP); } return insert_page(vma, addr, page, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_page); /* * __vm_map_pages - maps range of kernel pages into user vma * @vma: user vma to map to * @pages: pointer to array of source kernel pages * @num: number of pages in page array * @offset: user's requested vm_pgoff * * This allows drivers to map range of kernel pages into a user vma. * The zeropage is supported in some VMAs, see * vm_mixed_zeropage_allowed(). * * Return: 0 on success and error code otherwise. */ static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num, unsigned long offset) { unsigned long count = vma_pages(vma); unsigned long uaddr = vma->vm_start; int ret, i; /* Fail if the user requested offset is beyond the end of the object */ if (offset >= num) return -ENXIO; /* Fail if the user requested size exceeds available object size */ if (count > num - offset) return -ENXIO; for (i = 0; i < count; i++) { ret = vm_insert_page(vma, uaddr, pages[offset + i]); if (ret < 0) return ret; uaddr += PAGE_SIZE; } return 0; } /** * vm_map_pages - maps range of kernel pages starts with non zero offset * @vma: user vma to map to * @pages: pointer to array of source kernel pages * @num: number of pages in page array * * Maps an object consisting of @num pages, catering for the user's * requested vm_pgoff * * If we fail to insert any page into the vma, the function will return * immediately leaving any previously inserted pages present. Callers * from the mmap handler may immediately return the error as their caller * will destroy the vma, removing any successfully inserted pages. Other * callers should make their own arrangements for calling unmap_region(). * * Context: Process context. Called by mmap handlers. * Return: 0 on success and error code otherwise. */ int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num) { return __vm_map_pages(vma, pages, num, vma->vm_pgoff); } EXPORT_SYMBOL(vm_map_pages); /** * vm_map_pages_zero - map range of kernel pages starts with zero offset * @vma: user vma to map to * @pages: pointer to array of source kernel pages * @num: number of pages in page array * * Similar to vm_map_pages(), except that it explicitly sets the offset * to 0. This function is intended for the drivers that did not consider * vm_pgoff. * * Context: Process context. Called by mmap handlers. * Return: 0 on success and error code otherwise. */ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, unsigned long num) { return __vm_map_pages(vma, pages, num, 0); } EXPORT_SYMBOL(vm_map_pages_zero); static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, entry; spinlock_t *ptl; pte = get_locked_pte(mm, addr, &ptl); if (!pte) return VM_FAULT_OOM; entry = ptep_get(pte); if (!pte_none(entry)) { if (mkwrite) { /* * For read faults on private mappings the PFN passed * in may not match the PFN we have mapped if the * mapped PFN is a writeable COW page. In the mkwrite * case we are creating a writable PTE for a shared * mapping and we expect the PFNs to match. If they * don't match, we are likely racing with block * allocation and mapping invalidation so just skip the * update. */ if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) { WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry))); goto out_unlock; } entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, addr, pte, entry, 1)) update_mmu_cache(vma, addr, pte); } goto out_unlock; } /* Ok, finally just insert the thing.. */ if (pfn_t_devmap(pfn)) entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); else entry = pte_mkspecial(pfn_t_pte(pfn, prot)); if (mkwrite) { entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); } set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ out_unlock: pte_unmap_unlock(pte, ptl); return VM_FAULT_NOPAGE; } /** * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot * @vma: user vma to map to * @addr: target user address of this page * @pfn: source kernel pfn * @pgprot: pgprot flags for the inserted page * * This is exactly like vmf_insert_pfn(), except that it allows drivers * to override pgprot on a per-page basis. * * This only makes sense for IO mappings, and it makes no sense for * COW mappings. In general, using multiple vmas is preferable; * vmf_insert_pfn_prot should only be used if using multiple VMAs is * impractical. * * pgprot typically only differs from @vma->vm_page_prot when drivers set * caching- and encryption bits different than those of @vma->vm_page_prot, * because the caching- or encryption mode may not be known at mmap() time. * * This is ok as long as @vma->vm_page_prot is not used by the core vm * to set caching and encryption bits for those vmas (except for COW pages). * This is ensured by core vm only modifying these page table entries using * functions that don't touch caching- or encryption bits, using pte_modify() * if needed. (See for example mprotect()). * * Also when new page-table entries are created, this is only done using the * fault() callback, and never using the value of vma->vm_page_prot, * except for page-table entries that point to anonymous pages as the result * of COW. * * Context: Process context. May allocate using %GFP_KERNEL. * Return: vm_fault_t value. */ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot) { /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like * consistency in testing and feature parity among all, so we should * try to keep these invariants in place for everybody. */ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; if (!pfn_modify_allowed(pfn, pgprot)) return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, false); } EXPORT_SYMBOL(vmf_insert_pfn_prot); /** * vmf_insert_pfn - insert single pfn into user vma * @vma: user vma to map to * @addr: target user address of this page * @pfn: source kernel pfn * * Similar to vm_insert_page, this allows drivers to insert individual pages * they've allocated into a user vma. Same comments apply. * * This function should only be called from a vm_ops->fault handler, and * in that case the handler should return the result of this function. * * vma cannot be a COW mapping. * * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. * * Context: Process context. May allocate using %GFP_KERNEL. * Return: vm_fault_t value. */ vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); } EXPORT_SYMBOL(vmf_insert_pfn); static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite) { if (unlikely(is_zero_pfn(pfn_t_to_pfn(pfn))) && (mkwrite || !vm_mixed_zeropage_allowed(vma))) return false; /* these checks mirror the abort conditions in vm_normal_page */ if (vma->vm_flags & VM_MIXEDMAP) return true; if (pfn_t_devmap(pfn)) return true; if (pfn_t_special(pfn)) return true; if (is_zero_pfn(pfn_t_to_pfn(pfn))) return true; return false; } static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; int err; if (!vm_mixed_ok(vma, pfn, mkwrite)) return VM_FAULT_SIGBUS; if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, pfn); if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) return VM_FAULT_SIGBUS; /* * If we don't have pte special, then we have to use the pfn_valid() * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* * refcount the page if pfn_valid is true (hence insert_page rather * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { struct page *page; /* * At this point we are committed to insert_page() * regardless of whether the caller specified flags that * result in pfn_t_has_page() == false. */ page = pfn_to_page(pfn_t_to_pfn(pfn)); err = insert_page(vma, addr, page, pgprot); } else { return insert_pfn(vma, addr, pfn, pgprot, mkwrite); } if (err == -ENOMEM) return VM_FAULT_OOM; if (err < 0 && err != -EBUSY) return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { return __vm_insert_mixed(vma, addr, pfn, false); } EXPORT_SYMBOL(vmf_insert_mixed); /* * If the insertion of PTE failed because someone else already added a * different entry in the mean time, we treat that as success as we assume * the same entry was actually inserted. */ vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { return __vm_insert_mixed(vma, addr, pfn, true); } /* * maps a range of physical memory into the requested pages. the old * mappings are removed. any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") */ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pte_t *pte, *mapped_pte; spinlock_t *ptl; int err = 0; mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(ptep_get(pte))); if (!pfn_modify_allowed(pfn, prot)) { err = -EACCES; break; } set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(mapped_pte, ptl); return err; } static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pmd_t *pmd; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); err = remap_pte_range(mm, pmd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (pmd++, addr = next, addr != end); return 0; } static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pud_t *pud; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; pud = pud_alloc(mm, p4d, addr); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); err = remap_pmd_range(mm, pud, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (pud++, addr = next, addr != end); return 0; } static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { p4d_t *p4d; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); err = remap_pud_range(mm, p4d, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (p4d++, addr = next, addr != end); return 0; } static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { pgd_t *pgd; unsigned long next; unsigned long end = addr + PAGE_ALIGN(size); struct mm_struct *mm = vma->vm_mm; int err; if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; /* * Physically remapped pages are special. Tell the * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). * VM_PFNMAP tells the core MM that the base pages are just * raw PFN mappings, and do not have a "struct page" associated * with them. * VM_DONTEXPAND * Disable vma merging and expanding with mremap(). * VM_DONTDUMP * Omit vma from core dump, even when VM_IO turned off. * * There's a horrible special case to handle copy-on-write * behaviour that some programs depend on. We mark the "original" * un-COW'ed pages by matching them up with "vma->vm_pgoff". * See vm_normal_page() for details. */ if (is_cow_mapping(vma->vm_flags)) { if (addr != vma->vm_start || end != vma->vm_end) return -EINVAL; vma->vm_pgoff = pfn; } vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); do { next = pgd_addr_end(addr, end); err = remap_p4d_range(mm, pgd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (pgd++, addr = next, addr != end); return 0; } /* * Variant of remap_pfn_range that does not call track_pfn_remap. The caller * must have pre-validated the caching bits of the pgprot_t. */ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { int error = remap_pfn_range_internal(vma, addr, pfn, size, prot); if (!error) return 0; /* * A partial pfn range mapping is dangerous: it does not * maintain page reference counts, and callers may free * pages due to the error. So zap it early. */ zap_page_range_single(vma, addr, size, NULL); return error; } /** * remap_pfn_range - remap kernel memory to userspace * @vma: user vma to map to * @addr: target page aligned user address to start at * @pfn: page frame number of kernel physical memory address * @size: size of mapping area * @prot: page protection flags for this mapping * * Note: this is only safe if the mm semaphore is held when called. * * Return: %0 on success, negative error code otherwise. */ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { int err; err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); if (err) return -EINVAL; err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); if (err) untrack_pfn(vma, pfn, PAGE_ALIGN(size), true); return err; } EXPORT_SYMBOL(remap_pfn_range); /** * vm_iomap_memory - remap memory to userspace * @vma: user vma to map to * @start: start of the physical memory to be mapped * @len: size of area * * This is a simplified io_remap_pfn_range() for common driver use. The * driver just needs to give us the physical memory range to be mapped, * we'll figure out the rest from the vma information. * * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get * whatever write-combining details or similar. * * Return: %0 on success, negative error code otherwise. */ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) { unsigned long vm_len, pfn, pages; /* Check that the physical memory area passed in looks valid */ if (start + len < start) return -EINVAL; /* * You *really* shouldn't map things that aren't page-aligned, * but we've historically allowed it because IO memory might * just have smaller alignment. */ len += start & ~PAGE_MASK; pfn = start >> PAGE_SHIFT; pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; if (pfn + pages < pfn) return -EINVAL; /* We start the mapping 'vm_pgoff' pages into the area */ if (vma->vm_pgoff > pages) return -EINVAL; pfn += vma->vm_pgoff; pages -= vma->vm_pgoff; /* Can we fit all of the mapping? */ vm_len = vma->vm_end - vma->vm_start; if (vm_len >> PAGE_SHIFT > pages) return -EINVAL; /* Ok, let it rip */ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); } EXPORT_SYMBOL(vm_iomap_memory); static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { pte_t *pte, *mapped_pte; int err = 0; spinlock_t *ptl; if (create) { mapped_pte = pte = (mm == &init_mm) ? pte_alloc_kernel_track(pmd, addr, mask) : pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; } else { mapped_pte = pte = (mm == &init_mm) ? pte_offset_kernel(pmd, addr) : pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte) return -EINVAL; } arch_enter_lazy_mmu_mode(); if (fn) { do { if (create || !pte_none(ptep_get(pte))) { err = fn(pte++, addr, data); if (err) break; } } while (addr += PAGE_SIZE, addr != end); } *mask |= PGTBL_PTE_MODIFIED; arch_leave_lazy_mmu_mode(); if (mm != &init_mm) pte_unmap_unlock(mapped_pte, ptl); return err; } static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; int err = 0; BUG_ON(pud_leaf(*pud)); if (create) { pmd = pmd_alloc_track(mm, pud, addr, mask); if (!pmd) return -ENOMEM; } else { pmd = pmd_offset(pud, addr); } do { next = pmd_addr_end(addr, end); if (pmd_none(*pmd) && !create) continue; if (WARN_ON_ONCE(pmd_leaf(*pmd))) return -EINVAL; if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) { if (!create) continue; pmd_clear_bad(pmd); } err = apply_to_pte_range(mm, pmd, addr, next, fn, data, create, mask); if (err) break; } while (pmd++, addr = next, addr != end); return err; } static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; int err = 0; if (create) { pud = pud_alloc_track(mm, p4d, addr, mask); if (!pud) return -ENOMEM; } else { pud = pud_offset(p4d, addr); } do { next = pud_addr_end(addr, end); if (pud_none(*pud) && !create) continue; if (WARN_ON_ONCE(pud_leaf(*pud))) return -EINVAL; if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) { if (!create) continue; pud_clear_bad(pud); } err = apply_to_pmd_range(mm, pud, addr, next, fn, data, create, mask); if (err) break; } while (pud++, addr = next, addr != end); return err; } static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; int err = 0; if (create) { p4d = p4d_alloc_track(mm, pgd, addr, mask); if (!p4d) return -ENOMEM; } else { p4d = p4d_offset(pgd, addr); } do { next = p4d_addr_end(addr, end); if (p4d_none(*p4d) && !create) continue; if (WARN_ON_ONCE(p4d_leaf(*p4d))) return -EINVAL; if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) { if (!create) continue; p4d_clear_bad(p4d); } err = apply_to_pud_range(mm, p4d, addr, next, fn, data, create, mask); if (err) break; } while (p4d++, addr = next, addr != end); return err; } static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data, bool create) { pgd_t *pgd; unsigned long start = addr, next; unsigned long end = addr + size; pgtbl_mod_mask mask = 0; int err = 0; if (WARN_ON(addr >= end)) return -EINVAL; pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none(*pgd) && !create) continue; if (WARN_ON_ONCE(pgd_leaf(*pgd))) return -EINVAL; if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) { if (!create) continue; pgd_clear_bad(pgd); } err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask); if (err) break; } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, start + size); return err; } /* * Scan a region of virtual memory, filling in page tables as necessary * and calling a provided function on each leaf page table. */ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data) { return __apply_to_page_range(mm, addr, size, fn, data, true); } EXPORT_SYMBOL_GPL(apply_to_page_range); /* * Scan a region of virtual memory, calling a provided function on * each leaf page table where it exists. * * Unlike apply_to_page_range, this does _not_ fill in page tables * where they are absent. */ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data) { return __apply_to_page_range(mm, addr, size, fn, data, false); } EXPORT_SYMBOL_GPL(apply_to_existing_page_range); /* * handle_pte_fault chooses page fault handler according to an entry which was * read non-atomically. Before making any commitment, on those architectures * or configurations (e.g. i386 with PAE) which might give a mix of unmatched * parts, do_swap_page must check under lock before unmapping the pte and * proceeding (but do_wp_page is only called after already making such a check; * and do_anonymous_page can safely check later on). */ static inline int pte_unmap_same(struct vm_fault *vmf) { int same = 1; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) if (sizeof(pte_t) > sizeof(unsigned long)) { spin_lock(vmf->ptl); same = pte_same(ptep_get(vmf->pte), vmf->orig_pte); spin_unlock(vmf->ptl); } #endif pte_unmap(vmf->pte); vmf->pte = NULL; return same; } /* * Return: * 0: copied succeeded * -EHWPOISON: copy failed due to hwpoison in source page * -EAGAIN: copied failed (some other reason) */ static inline int __wp_page_copy_user(struct page *dst, struct page *src, struct vm_fault *vmf) { int ret; void *kaddr; void __user *uaddr; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; unsigned long addr = vmf->address; if (likely(src)) { if (copy_mc_user_highpage(dst, src, addr, vma)) return -EHWPOISON; return 0; } /* * If the source page was a PFN mapping, we don't have * a "struct page" for it. We do a best-effort copy by * just copying from the original user address. If that * fails, we just zero-fill it. Live with it. */ kaddr = kmap_local_page(dst); pagefault_disable(); uaddr = (void __user *)(addr & PAGE_MASK); /* * On architectures with software "accessed" bits, we would * take a double page fault, so mark it accessed here. */ vmf->pte = NULL; if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) { pte_t entry; vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* * Other thread has already handled the fault * and update local tlb only */ if (vmf->pte) update_mmu_tlb(vma, addr, vmf->pte); ret = -EAGAIN; goto pte_unlock; } entry = pte_mkyoung(vmf->orig_pte); if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1); } /* * This really shouldn't fail, because the page is there * in the page tables. But it might just be unreadable, * in which case we just give up and fill the result with * zeroes. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { if (vmf->pte) goto warn; /* Re-validate under PTL if the page is still mapped */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* The PTE changed under us, update local tlb */ if (vmf->pte) update_mmu_tlb(vma, addr, vmf->pte); ret = -EAGAIN; goto pte_unlock; } /* * The same page can be mapped back since last copy attempt. * Try to copy again under PTL. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { /* * Give a warn in case there can be some obscure * use-case */ warn: WARN_ON_ONCE(1); clear_page(kaddr); } } ret = 0; pte_unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); pagefault_enable(); kunmap_local(kaddr); flush_dcache_page(dst); return ret; } static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) { struct file *vm_file = vma->vm_file; if (vm_file) return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO; /* * Special mappings (e.g. VDSO) do not have any file so fake * a default GFP_KERNEL for them. */ return GFP_KERNEL; } /* * Notify the address space that the page is about to become writable so that * it can prohibit this or wait for the page to get into an appropriate state. * * We do this without the lock held, so that it can sleep if it needs to. */ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio) { vm_fault_t ret; unsigned int old_flags = vmf->flags; vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; if (vmf->vma->vm_file && IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host)) return VM_FAULT_SIGBUS; ret = vmf->vma->vm_ops->page_mkwrite(vmf); /* Restore original flags so that caller is not surprised */ vmf->flags = old_flags; if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; if (unlikely(!(ret & VM_FAULT_LOCKED))) { folio_lock(folio); if (!folio->mapping) { folio_unlock(folio); return 0; /* retry */ } ret |= VM_FAULT_LOCKED; } else VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); return ret; } /* * Handle dirtying of a page in shared file mapping on a write fault. * * The function expects the page to be locked and unlocks it. */ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping; struct folio *folio = page_folio(vmf->page); bool dirtied; bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; dirtied = folio_mark_dirty(folio); VM_BUG_ON_FOLIO(folio_test_anon(folio), folio); /* * Take a local copy of the address_space - folio.mapping may be zeroed * by truncate after folio_unlock(). The address_space itself remains * pinned by vma->vm_file's reference. We rely on folio_unlock()'s * release semantics to prevent the compiler from undoing this copying. */ mapping = folio_raw_mapping(folio); folio_unlock(folio); if (!page_mkwrite) file_update_time(vma->vm_file); /* * Throttle page dirtying rate down to writeback speed. * * mapping may be NULL here because some device drivers do not * set page.mapping but still dirty their pages * * Drop the mmap_lock before waiting on IO, if we can. The file * is pinning the mapping, as per above. */ if ((dirtied || page_mkwrite) && mapping) { struct file *fpin; fpin = maybe_unlock_mmap_for_io(vmf, NULL); balance_dirty_pages_ratelimited(mapping); if (fpin) { fput(fpin); return VM_FAULT_COMPLETED; } } return 0; } /* * Handle write page faults for pages that can be reused in the current vma * * This can happen either due to the mapping being with the VM_SHARED flag, * or due to us being the last reference standing to the page. In either * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; pte_t entry; VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); VM_WARN_ON(is_zero_pfn(pte_pfn(vmf->orig_pte))); if (folio) { VM_BUG_ON(folio_test_anon(folio) && !PageAnonExclusive(vmf->page)); /* * Clear the folio's cpupid information as the existing * information potentially belongs to a now completely * unrelated process. */ folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = pte_mkyoung(vmf->orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); pte_unmap_unlock(vmf->pte, vmf->ptl); count_vm_event(PGREUSE); } /* * We could add a bitflag somewhere, but for now, we know that all * vm_ops that have a ->map_pages have been audited and don't need * the mmap_lock to be held. */ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK)) return 0; vma_end_read(vma); return VM_FAULT_RETRY; } /** * __vmf_anon_prepare - Prepare to handle an anonymous fault. * @vmf: The vm_fault descriptor passed from the fault handler. * * When preparing to insert an anonymous page into a VMA from a * fault handler, call this function rather than anon_vma_prepare(). * If this vma does not already have an associated anon_vma and we are * only protected by the per-VMA lock, the caller must retry with the * mmap_lock held. __anon_vma_prepare() will look at adjacent VMAs to * determine if this VMA can share its anon_vma, and that's not safe to * do with only the per-VMA lock held for this VMA. * * Return: 0 if fault handling can proceed. Any other value should be * returned to the caller. */ vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; if (likely(vma->anon_vma)) return 0; if (vmf->flags & FAULT_FLAG_VMA_LOCK) { if (!mmap_read_trylock(vma->vm_mm)) return VM_FAULT_RETRY; } if (__anon_vma_prepare(vma)) ret = VM_FAULT_OOM; if (vmf->flags & FAULT_FLAG_VMA_LOCK) mmap_read_unlock(vma->vm_mm); return ret; } /* * Handle the case of a page which we actually need to copy to a new page, * either due to COW or unsharing. * * Called with mmap_lock locked and the old page referenced, but * without the ptl held. * * High level logic flow: * * - Allocate a page, copy the content of the old page to the new one. * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc. * - Take the PTL. If the pte changed, bail out and release the allocated page * - If the pte is still the way we remember it, update the page table and all * relevant references. This includes dropping the reference the page-table * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. */ static vm_fault_t wp_page_copy(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct folio *old_folio = NULL; struct folio *new_folio = NULL; pte_t entry; int page_copied = 0; struct mmu_notifier_range range; vm_fault_t ret; bool pfn_is_zero; delayacct_wpcopy_start(); if (vmf->page) old_folio = page_folio(vmf->page); ret = vmf_anon_prepare(vmf); if (unlikely(ret)) goto out; pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte)); new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero); if (!new_folio) goto oom; if (!pfn_is_zero) { int err; err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); if (err) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on * the same address and we will handle the fault * from the second attempt. * The -EHWPOISON case will not be retried. */ folio_put(new_folio); if (old_folio) folio_put(old_folio); delayacct_wpcopy_end(); return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0; } kmsan_copy_page_meta(&new_folio->page, vmf->page); } __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); /* * Re-check the pte - we dropped the lock */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { if (old_folio) { if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(old_folio)); inc_mm_counter(mm, MM_ANONPAGES); } } else { ksm_might_unmap_zero_page(mm, vmf->orig_pte); inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(&new_folio->page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (unlikely(unshare)) { if (pte_soft_dirty(vmf->orig_pte)) entry = pte_mksoft_dirty(entry); if (pte_uffd_wp(vmf->orig_pte)) entry = pte_mkuffd_wp(entry); } else { entry = maybe_mkwrite(pte_mkdirty(entry), vma); } /* * Clear the pte entry and flush it first, before updating the * pte with the new entry, to keep TLBs on different CPUs in * sync. This code used to set the new PTE then flush TLBs, but * that left a window where the new PTE could be loaded into * some TLBs while the old PTE remains in others. */ ptep_clear_flush(vma, vmf->address, vmf->pte); folio_add_new_anon_rmap(new_folio, vma, vmf->address, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, vma); BUG_ON(unshare && pte_write(entry)); set_pte_at(mm, vmf->address, vmf->pte, entry); update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); if (old_folio) { /* * Only after switching the pte to the new page may * we remove the mapcount here. Otherwise another * process may come and find the rmap count decremented * before the pte is switched to the new page, and * "reuse" the old page writing into it while our pte * here still points into it and can be read by other * threads. * * The critical issue is to order this * folio_remove_rmap_pte() with the ptp_clear_flush * above. Those stores are ordered by (if nothing else,) * the barrier present in the atomic_add_negative * in folio_remove_rmap_pte(); * * Then the TLB flush in ptep_clear_flush ensures that * no process can access the old page before the * decremented mapcount is visible. And the old page * cannot be reused until after the decremented * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ folio_remove_rmap_pte(old_folio, vmf->page, vma); } /* Free the old page.. */ new_folio = old_folio; page_copied = 1; pte_unmap_unlock(vmf->pte, vmf->ptl); } else if (vmf->pte) { update_mmu_tlb(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); } mmu_notifier_invalidate_range_end(&range); if (new_folio) folio_put(new_folio); if (old_folio) { if (page_copied) free_swap_cache(old_folio); folio_put(old_folio); } delayacct_wpcopy_end(); return 0; oom: ret = VM_FAULT_OOM; out: if (old_folio) folio_put(old_folio); delayacct_wpcopy_end(); return ret; } /** * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE * writeable once the page is prepared * * @vmf: structure describing the fault * @folio: the folio of vmf->page * * This function handles all that is needed to finish a write page fault in a * shared mapping due to PTE being read-only once the mapped page is prepared. * It handles locking of PTE and modifying it. * * The function expects the page to be locked or other protection against * concurrent faults / writeback (such as DAX radix tree locks). * * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before * we acquired PTE lock. */ static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio) { WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!vmf->pte) return VM_FAULT_NOPAGE; /* * We might have raced with another page fault while we released the * pte_offset_map_lock. */ if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return VM_FAULT_NOPAGE; } wp_page_reuse(vmf, folio); return 0; } /* * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); ret = vmf_can_call_fault(vmf); if (ret) return ret; vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vmf); if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; return finish_mkwrite_fault(vmf, NULL); } wp_page_reuse(vmf, NULL); return 0; } static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; folio_get(folio); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); tmp = vmf_can_call_fault(vmf); if (tmp) { folio_put(folio); return tmp; } tmp = do_page_mkwrite(vmf, folio); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { folio_put(folio); return tmp; } tmp = finish_mkwrite_fault(vmf, folio); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { folio_unlock(folio); folio_put(folio); return tmp; } } else { wp_page_reuse(vmf, folio); folio_lock(folio); } ret |= fault_dirty_shared_page(vmf); folio_put(folio); return ret; } static bool wp_can_reuse_anon_folio(struct folio *folio, struct vm_area_struct *vma) { /* * We could currently only reuse a subpage of a large folio if no * other subpages of the large folios are still mapped. However, * let's just consistently not reuse subpages even if we could * reuse in that scenario, and give back a large folio a bit * sooner. */ if (folio_test_large(folio)) return false; /* * We have to verify under folio lock: these early checks are * just an optimization to avoid locking the folio and freeing * the swapcache if there is little hope that we can reuse. * * KSM doesn't necessarily raise the folio refcount. */ if (folio_test_ksm(folio) || folio_ref_count(folio) > 3) return false; if (!folio_test_lru(folio)) /* * We cannot easily detect+handle references from * remote LRU caches or references to LRU folios. */ lru_add_drain(); if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio)) return false; if (!folio_trylock(folio)) return false; if (folio_test_swapcache(folio)) folio_free_swap(folio); if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) { folio_unlock(folio); return false; } /* * Ok, we've got the only folio reference from our mapping * and the folio is locked, it's dark out, and we're wearing * sunglasses. Hit it. */ folio_move_anon_rmap(folio, vma); folio_unlock(folio); return true; } /* * This routine handles present pages, when * * users try to write to a shared page (FAULT_FLAG_WRITE) * * GUP wants to take a R/O pin on a possibly shared anonymous page * (FAULT_FLAG_UNSHARE) * * It is done by copying the page to a new address and decrementing the * shared-page counter for the old page. * * Note that this routine assumes that the protection checks have been * done by the caller (the low-level page fault routine in most cases). * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've * done any necessary COW. * * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even * though the page will change only once the write actually happens. This * avoids a few races, and potentially makes it more efficient. * * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_lock still held, but pte unmapped and unlocked. */ static vm_fault_t do_wp_page(struct vm_fault *vmf) __releases(vmf->ptl) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; pte_t pte; if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) { if (!userfaultfd_wp_async(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP); } /* * Nothing needed (cache flush, TLB invalidations, * etc.) because we're only removing the uffd-wp bit, * which is completely invisible to the user. */ pte = pte_clear_uffd_wp(ptep_get(vmf->pte)); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); /* * Update this to be prepared for following up CoW * handling */ vmf->orig_pte = pte; } /* * Userfaultfd write-protect can defer flushes. Ensure the TLB * is flushed in this case before copying. */ if (unlikely(userfaultfd_wp(vmf->vma) && mm_tlb_flush_pending(vmf->vma->vm_mm))) flush_tlb_page(vmf->vma, vmf->address); } vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); if (vmf->page) folio = page_folio(vmf->page); /* * Shared mapping: we are guaranteed to have VM_WRITE and * FAULT_FLAG_WRITE set at this point. */ if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a * VM_PFNMAP VMA. * * We should not cow pages in a shared writeable mapping. * Just mark the pages writable and/or call ops->pfn_mkwrite. */ if (!vmf->page) return wp_pfn_shared(vmf); return wp_page_shared(vmf, folio); } /* * Private mapping: create an exclusive anonymous page copy if reuse * is impossible. We might miss VM_WRITE for FOLL_FORCE handling. * * If we encounter a page that is marked exclusive, we must reuse * the page without further checks. */ if (folio && folio_test_anon(folio) && (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) { if (!PageAnonExclusive(vmf->page)) SetPageAnonExclusive(vmf->page); if (unlikely(unshare)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } wp_page_reuse(vmf, folio); return 0; } /* * Ok, we need to copy. Oh, well.. */ if (folio) folio_get(folio); pte_unmap_unlock(vmf->pte, vmf->ptl); #ifdef CONFIG_KSM if (folio && folio_test_ksm(folio)) count_vm_event(COW_KSM); #endif return wp_page_copy(vmf); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) { zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } static inline void unmap_mapping_range_tree(struct rb_root_cached *root, pgoff_t first_index, pgoff_t last_index, struct zap_details *details) { struct vm_area_struct *vma; pgoff_t vba, vea, zba, zea; vma_interval_tree_foreach(vma, root, first_index, last_index) { vba = vma->vm_pgoff; vea = vba + vma_pages(vma) - 1; zba = max(first_index, vba); zea = min(last_index, vea); unmap_mapping_range_vma(vma, ((zba - vba) << PAGE_SHIFT) + vma->vm_start, ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, details); } } /** * unmap_mapping_folio() - Unmap single folio from processes. * @folio: The locked folio to be unmapped. * * Unmap this folio from any userspace process which still has it mmaped. * Typically, for efficiency, the range of nearby pages has already been * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once * truncation or invalidation holds the lock on a folio, it may find that * the page has been remapped again: and then uses unmap_mapping_folio() * to unmap it finally. */ void unmap_mapping_folio(struct folio *folio) { struct address_space *mapping = folio->mapping; struct zap_details details = { }; pgoff_t first_index; pgoff_t last_index; VM_BUG_ON(!folio_test_locked(folio)); first_index = folio->index; last_index = folio_next_index(folio) - 1; details.even_cows = false; details.single_folio = folio; details.zap_flags = ZAP_FLAG_DROP_MARKER; i_mmap_lock_read(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, first_index, last_index, &details); i_mmap_unlock_read(mapping); } /** * unmap_mapping_pages() - Unmap pages from processes. * @mapping: The address space containing pages to be unmapped. * @start: Index of first page to be unmapped. * @nr: Number of pages to be unmapped. 0 to unmap to end of file. * @even_cows: Whether to unmap even private COWed pages. * * Unmap the pages in this address space from any userspace process which * has them mmaped. Generally, you want to remove COWed pages as well when * a file is being truncated, but not when invalidating pages from the page * cache. */ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { struct zap_details details = { }; pgoff_t first_index = start; pgoff_t last_index = start + nr - 1; details.even_cows = even_cows; if (last_index < first_index) last_index = ULONG_MAX; i_mmap_lock_read(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, first_index, last_index, &details); i_mmap_unlock_read(mapping); } EXPORT_SYMBOL_GPL(unmap_mapping_pages); /** * unmap_mapping_range - unmap the portion of all mmaps in the specified * address_space corresponding to the specified byte range in the underlying * file. * * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE * boundary. Note that this is different from truncate_pagecache(), which * must keep the partial page. In contrast, we must get rid of * partial pages. * @holelen: size of prospective hole in bytes. This will be rounded * up to a PAGE_SIZE boundary. A holelen of zero truncates to the * end of the file. * @even_cows: 1 when truncating a file, unmap even private COWed pages; * but 0 when invalidating pagecache, don't throw away private data. */ void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT; pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Check for overflow. */ if (sizeof(holelen) > sizeof(hlen)) { long long holeend = (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; if (holeend & ~(long long)ULONG_MAX) hlen = ULONG_MAX - hba + 1; } unmap_mapping_pages(mapping, hba, hlen, even_cows); } EXPORT_SYMBOL(unmap_mapping_range); /* * Restore a potential device exclusive pte to a working pte entry */ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) { struct folio *folio = page_folio(vmf->page); struct vm_area_struct *vma = vmf->vma; struct mmu_notifier_range range; vm_fault_t ret; /* * We need a reference to lock the folio because we don't hold * the PTL so a racing thread can remove the device-exclusive * entry and unmap it. If the folio is free the entry must * have been removed already. If it happens to have already * been re-allocated after being freed all we do is lock and * unlock it. */ if (!folio_try_get(folio)) return 0; ret = folio_lock_or_retry(folio, vmf); if (ret) { folio_put(folio); return ret; } mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL); mmu_notifier_invalidate_range_start(&range); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); folio_unlock(folio); folio_put(folio); mmu_notifier_invalidate_range_end(&range); return 0; } static inline bool should_try_to_free_swap(struct folio *folio, struct vm_area_struct *vma, unsigned int fault_flags) { if (!folio_test_swapcache(folio)) return false; if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) || folio_test_mlocked(folio)) return true; /* * If we want to map a page that's in the swapcache writable, we * have to detect via the refcount if we're really the exclusive * user. Try freeing the swapcache to get rid of the swapcache * reference only in case it's likely that we'll be the exlusive user. */ return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) && folio_ref_count(folio) == (1 + folio_nr_pages(folio)); } static vm_fault_t pte_marker_clear(struct vm_fault *vmf) { vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!vmf->pte) return 0; /* * Be careful so that we will only recover a special uffd-wp pte into a * none pte. Otherwise it means the pte could have changed, so retry. * * This should also cover the case where e.g. the pte changed * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED. * So is_pte_marker() check is not enough to safely drop the pte. */ if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } static vm_fault_t do_pte_missing(struct vm_fault *vmf) { if (vma_is_anonymous(vmf->vma)) return do_anonymous_page(vmf); else return do_fault(vmf); } /* * This is actually a page-missing access, but with uffd-wp special pte * installed. It means this pte was wr-protected before being unmapped. */ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) { /* * Just in case there're leftover special ptes even after the region * got unregistered - we can simply clear them. */ if (unlikely(!userfaultfd_wp(vmf->vma))) return pte_marker_clear(vmf); return do_pte_missing(vmf); } static vm_fault_t handle_pte_marker(struct vm_fault *vmf) { swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte); unsigned long marker = pte_marker_get(entry); /* * PTE markers should never be empty. If anything weird happened, * the best thing to do is to kill the process along with its mm. */ if (WARN_ON_ONCE(!marker)) return VM_FAULT_SIGBUS; /* Higher priority than uffd-wp when data corrupted */ if (marker & PTE_MARKER_POISONED) return VM_FAULT_HWPOISON; /* Hitting a guard page is always a fatal condition. */ if (marker & PTE_MARKER_GUARD) return VM_FAULT_SIGSEGV; if (pte_marker_entry_uffd_wp(entry)) return pte_marker_handle_uffd_wp(vmf); /* This is an unknown pte marker */ return VM_FAULT_SIGBUS; } static struct folio *__alloc_swap_folio(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; swp_entry_t entry; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); if (!folio) return NULL; entry = pte_to_swp_entry(vmf->orig_pte); if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, GFP_KERNEL, entry)) { folio_put(folio); return NULL; } return folio; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) { struct swap_info_struct *si = swp_swap_info(entry); pgoff_t offset = swp_offset(entry); int i; /* * While allocating a large folio and doing swap_read_folio, which is * the case the being faulted pte doesn't have swapcache. We need to * ensure all PTEs have no cache as well, otherwise, we might go to * swap devices while the content is in swapcache. */ for (i = 0; i < max_nr; i++) { if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) return i; } return i; } /* * Check if the PTEs within a range are contiguous swap entries * and have consistent swapcache, zeromap. */ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) { unsigned long addr; swp_entry_t entry; int idx; pte_t pte; addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); idx = (vmf->address - addr) / PAGE_SIZE; pte = ptep_get(ptep); if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) return false; entry = pte_to_swp_entry(pte); if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages) return false; /* * swap_read_folio() can't handle the case a large folio is hybridly * from different backends. And they are likely corner cases. Similar * things might be added once zswap support large folios. */ if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages)) return false; if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages)) return false; return true; } static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset, unsigned long addr, unsigned long orders) { int order, nr; order = highest_order(orders); /* * To swap in a THP with nr pages, we require that its first swap_offset * is aligned with that number, as it was when the THP was swapped out. * This helps filter out most invalid entries. */ while (orders) { nr = 1 << order; if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr) break; order = next_order(&orders, order); } return orders; } static struct folio *alloc_swap_folio(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; unsigned long orders; struct folio *folio; unsigned long addr; swp_entry_t entry; spinlock_t *ptl; pte_t *pte; gfp_t gfp; int order; /* * If uffd is active for the vma we need per-page fault fidelity to * maintain the uffd semantics. */ if (unlikely(userfaultfd_armed(vma))) goto fallback; /* * A large swapped out folio could be partially or fully in zswap. We * lack handling for such cases, so fallback to swapping in order-0 * folio. */ if (!zswap_never_enabled()) goto fallback; entry = pte_to_swp_entry(vmf->orig_pte); /* * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. */ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); orders = thp_swap_suitable_orders(swp_offset(entry), vmf->address, orders); if (!orders) goto fallback; pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address & PMD_MASK, &ptl); if (unlikely(!pte)) goto fallback; /* * For do_swap_page, find the highest order where the aligned range is * completely swap entries with contiguous swap offsets. */ order = highest_order(orders); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order)) break; order = next_order(&orders, order); } pte_unmap_unlock(pte, ptl); /* Try allocating the highest of the remaining orders. */ gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr); if (folio) { if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) return folio; folio_put(folio); } order = next_order(&orders, order); } fallback: return __alloc_swap_folio(vmf); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static struct folio *alloc_swap_folio(struct vm_fault *vmf) { return __alloc_swap_folio(vmf); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq); /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with pte unmapped and unlocked. * * We return with the mmap_lock locked or unlocked in the same cases * as does filemap_fault(). */ vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *swapcache, *folio = NULL; DECLARE_WAITQUEUE(wait, current); struct page *page; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; bool need_clear_cache = false; bool exclusive = false; swp_entry_t entry; pte_t pte; vm_fault_t ret = 0; void *shadow = NULL; int nr_pages; unsigned long page_idx; unsigned long address; pte_t *ptep; if (!pte_unmap_same(vmf)) goto out; entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); } else if (is_device_exclusive_entry(entry)) { vmf->page = pfn_swap_entry_to_page(entry); ret = remove_device_exclusive_entry(vmf); } else if (is_device_private_entry(entry)) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) { /* * migrate_to_ram is not yet ready to operate * under VMA lock. */ vma_end_read(vma); ret = VM_FAULT_RETRY; goto out; } vmf->page = pfn_swap_entry_to_page(entry); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) goto unlock; /* * Get a page reference while we know the page can't be * freed. */ get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); put_page(vmf->page); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else if (is_pte_marker_entry(entry)) { ret = handle_pte_marker(vmf); } else { print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; } /* Prevent swapoff from happening to us. */ si = get_swap_device(entry); if (unlikely(!si)) goto out; folio = swap_cache_get_folio(entry, vma, vmf->address); if (folio) page = folio_file_page(folio, swp_offset(entry)); swapcache = folio; if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { /* skip swapcache */ folio = alloc_swap_folio(vmf); if (folio) { __folio_set_locked(folio); __folio_set_swapbacked(folio); nr_pages = folio_nr_pages(folio); if (folio_test_large(folio)) entry.val = ALIGN_DOWN(entry.val, nr_pages); /* * Prevent parallel swapin from proceeding with * the cache flag. Otherwise, another thread * may finish swapin first, free the entry, and * swapout reusing the same entry. It's * undetectable as pte_same() returns true due * to entry reuse. */ if (swapcache_prepare(entry, nr_pages)) { /* * Relax a bit to prevent rapid * repeated page faults. */ add_wait_queue(&swapcache_wq, &wait); schedule_timeout_uninterruptible(1); remove_wait_queue(&swapcache_wq, &wait); goto out_page; } need_clear_cache = true; mem_cgroup_swapin_uncharge_swap(entry, nr_pages); shadow = get_shadow_from_swap_cache(entry); if (shadow) workingset_refault(folio, shadow); folio_add_lru(folio); /* To provide entry to swap_read_folio() */ folio->swap = entry; swap_read_folio(folio, NULL); folio->private = NULL; } } else { folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); swapcache = folio; } if (!folio) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) ret = VM_FAULT_OOM; goto unlock; } /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); page = folio_file_page(folio, swp_offset(entry)); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing * owner processes (which may be unknown at hwpoison time) */ ret = VM_FAULT_HWPOISON; goto out_release; } ret |= folio_lock_or_retry(folio, vmf); if (ret & VM_FAULT_RETRY) goto out_release; if (swapcache) { /* * Make sure folio_free_swap() or swapoff did not release the * swapcache from under us. The page pin, and pte_same test * below, are not enough to exclude that. Even if it is still * swapcache, we need to check that the page's swap has not * changed. */ if (unlikely(!folio_test_swapcache(folio) || page_swap_entry(page).val != entry.val)) goto out_page; /* * KSM sometimes has to copy on read faults, for example, if * page->index of !PageKSM() pages would be nonlinear inside the * anon VMA -- PageKSM() is lost on actual swapout. */ folio = ksm_might_need_to_copy(folio, vma, vmf->address); if (unlikely(!folio)) { ret = VM_FAULT_OOM; folio = swapcache; goto out_page; } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { ret = VM_FAULT_HWPOISON; folio = swapcache; goto out_page; } if (folio != swapcache) page = folio_page(folio, 0); /* * If we want to map a page that's in the swapcache writable, we * have to detect via the refcount if we're really the exclusive * owner. Try removing the extra reference from the local LRU * caches if required. */ if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache && !folio_test_ksm(folio) && !folio_test_lru(folio)) lru_add_drain(); } folio_throttle_swaprate(folio, GFP_KERNEL); /* * Back out if somebody else already faulted in this pte. */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) goto out_nomap; if (unlikely(!folio_test_uptodate(folio))) { ret = VM_FAULT_SIGBUS; goto out_nomap; } /* allocated large folios for SWP_SYNCHRONOUS_IO */ if (folio_test_large(folio) && !folio_test_swapcache(folio)) { unsigned long nr = folio_nr_pages(folio); unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE); unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE; pte_t *folio_ptep = vmf->pte - idx; pte_t folio_pte = ptep_get(folio_ptep); if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || swap_pte_batch(folio_ptep, nr, folio_pte) != nr) goto out_nomap; page_idx = idx; address = folio_start; ptep = folio_ptep; goto check_folio; } nr_pages = 1; page_idx = 0; address = vmf->address; ptep = vmf->pte; if (folio_test_large(folio) && folio_test_swapcache(folio)) { int nr = folio_nr_pages(folio); unsigned long idx = folio_page_idx(folio, page); unsigned long folio_start = address - idx * PAGE_SIZE; unsigned long folio_end = folio_start + nr * PAGE_SIZE; pte_t *folio_ptep; pte_t folio_pte; if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start))) goto check_folio; if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end))) goto check_folio; folio_ptep = vmf->pte - idx; folio_pte = ptep_get(folio_ptep); if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || swap_pte_batch(folio_ptep, nr, folio_pte) != nr) goto check_folio; page_idx = idx; address = folio_start; ptep = folio_ptep; nr_pages = nr; entry = folio->swap; page = &folio->page; } check_folio: /* * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte * must never point at an anonymous page in the swapcache that is * PG_anon_exclusive. Sanity check that this holds and especially, that * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity * check after taking the PT lock and making sure that nobody * concurrently faulted in this page and set PG_anon_exclusive. */ BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio)); BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page)); /* * Check under PT lock (to protect against concurrent fork() sharing * the swap entry concurrently) for certainly exclusive pages. */ if (!folio_test_ksm(folio)) { exclusive = pte_swp_exclusive(vmf->orig_pte); if (folio != swapcache) { /* * We have a fresh page that is not exposed to the * swapcache -> certainly exclusive. */ exclusive = true; } else if (exclusive && folio_test_writeback(folio) && data_race(si->flags & SWP_STABLE_WRITES)) { /* * This is tricky: not all swap backends support * concurrent page modifications while under writeback. * * So if we stumble over such a page in the swapcache * we must not set the page exclusive, otherwise we can * map it writable without further checks and modify it * while still under writeback. * * For these problematic swap backends, simply drop the * exclusive marker: this is perfectly fine as we start * writeback only if we fully unmapped the page and * there are no unexpected references on the page after * unmapping succeeded. After fully unmapped, no * further GUP references (FOLL_GET and FOLL_PIN) can * appear, so dropping the exclusive marker and mapping * it only R/O is fine. */ exclusive = false; } } /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free(). */ arch_swap_restore(folio_swap(entry, folio), folio); /* * Remove the swap entry and conditionally try to free up the swapcache. * We're already holding a reference on the page but haven't mapped it * yet. */ swap_free_nr(entry, nr_pages); if (should_try_to_free_swap(folio, vma, vmf->flags)) folio_free_swap(folio); add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); pte = mk_pte(page, vma->vm_page_prot); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); if (pte_swp_uffd_wp(vmf->orig_pte)) pte = pte_mkuffd_wp(pte); /* * Same logic as in do_wp_page(); however, optimize for pages that are * certainly not shared either because we just allocated them without * exposing them to the swapcache or because the swap entry indicates * exclusivity. */ if (!folio_test_ksm(folio) && (exclusive || folio_ref_count(folio) == 1)) { if ((vma->vm_flags & VM_WRITE) && !userfaultfd_pte_wp(vma, pte) && !pte_needs_soft_dirty_wp(vma, pte)) { pte = pte_mkwrite(pte, vma); if (vmf->flags & FAULT_FLAG_WRITE) { pte = pte_mkdirty(pte); vmf->flags &= ~FAULT_FLAG_WRITE; } } rmap_flags |= RMAP_EXCLUSIVE; } folio_ref_add(folio, nr_pages - 1); flush_icache_pages(vma, page, nr_pages); vmf->orig_pte = pte_advance_pfn(pte, page_idx); /* ksm created a completely new copy */ if (unlikely(folio != swapcache && swapcache)) { folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } else if (!folio_test_anon(folio)) { /* * We currently only expect small !anon folios which are either * fully exclusive or fully shared, or new allocated large * folios which are fully exclusive. If we ever get large * folios within swapcache here, we have to be careful. */ VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio)); VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); folio_add_new_anon_rmap(folio, vma, address, rmap_flags); } else { folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address, rmap_flags); } VM_BUG_ON(!folio_test_anon(folio) || (pte_write(pte) && !PageAnonExclusive(page))); set_ptes(vma->vm_mm, address, ptep, pte, nr_pages); arch_do_swap_page_nr(vma->vm_mm, vma, address, pte, pte, nr_pages); folio_unlock(folio); if (folio != swapcache && swapcache) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check * (to avoid false positives from pte_same). For * further safety release the lock after the swap_free * so that the swap count won't change under a * parallel locked swapcache. */ folio_unlock(swapcache); folio_put(swapcache); } if (vmf->flags & FAULT_FLAG_WRITE) { ret |= do_wp_page(vmf); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; } /* No need to invalidate - it was non-present before */ update_mmu_cache_range(vmf, vma, address, ptep, nr_pages); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out: /* Clear the swap cache pin for direct swapin after PTL unlock */ if (need_clear_cache) { swapcache_clear(si, entry, nr_pages); if (waitqueue_active(&swapcache_wq)) wake_up(&swapcache_wq); } if (si) put_swap_device(si); return ret; out_nomap: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: folio_unlock(folio); out_release: folio_put(folio); if (folio != swapcache && swapcache) { folio_unlock(swapcache); folio_put(swapcache); } if (need_clear_cache) { swapcache_clear(si, entry, nr_pages); if (waitqueue_active(&swapcache_wq)) wake_up(&swapcache_wq); } if (si) put_swap_device(si); return ret; } static bool pte_range_none(pte_t *pte, int nr_pages) { int i; for (i = 0; i < nr_pages; i++) { if (!pte_none(ptep_get_lockless(pte + i))) return false; } return true; } static struct folio *alloc_anon_folio(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; #ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long orders; struct folio *folio; unsigned long addr; pte_t *pte; gfp_t gfp; int order; /* * If uffd is active for the vma we need per-page fault fidelity to * maintain the uffd semantics. */ if (unlikely(userfaultfd_armed(vma))) goto fallback; /* * Get a list of all the (large) orders below PMD_ORDER that are enabled * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. */ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) goto fallback; pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK); if (!pte) return ERR_PTR(-EAGAIN); /* * Find the highest order where the aligned range is completely * pte_none(). Note that all remaining orders will be completely * pte_none(). */ order = highest_order(orders); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); if (pte_range_none(pte + pte_index(addr), 1 << order)) break; order = next_order(&orders, order); } pte_unmap(pte); if (!orders) goto fallback; /* Try allocating the highest of the remaining orders. */ gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr); if (folio) { if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); folio_put(folio); goto next; } folio_throttle_swaprate(folio, gfp); /* * When a folio is not zeroed during allocation * (__GFP_ZERO not used), folio_zero_user() is used * to make sure that the page corresponding to the * faulting address will be hot in the cache after * zeroing. */ if (!alloc_zeroed()) folio_zero_user(folio, vmf->address); return folio; } next: count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); order = next_order(&orders, order); } fallback: #endif return folio_prealloc(vma->vm_mm, vma, vmf->address, true); } /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_lock still held, but pte unmapped and unlocked. */ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address; struct folio *folio; vm_fault_t ret = 0; int nr_pages = 1; pte_t entry; /* File mapping without ->vm_ops ? */ if (vma->vm_flags & VM_SHARED) return VM_FAULT_SIGBUS; /* * Use pte_alloc() instead of pte_alloc_map(), so that OOM can * be distinguished from a transient failure of pte_offset_map(). */ if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!vmf->pte) goto unlock; if (vmf_pte_changed(vmf)) { update_mmu_tlb(vma, vmf->address, vmf->pte); goto unlock; } ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_MISSING); } goto setpte; } /* Allocate our own private page. */ ret = vmf_anon_prepare(vmf); if (ret) return ret; /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */ folio = alloc_anon_folio(vmf); if (IS_ERR(folio)) return 0; if (!folio) goto oom; nr_pages = folio_nr_pages(folio); addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); /* * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ __folio_mark_uptodate(folio); entry = mk_pte(&folio->page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry), vma); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) goto release; if (nr_pages == 1 && vmf_pte_changed(vmf)) { update_mmu_tlb(vma, addr, vmf->pte); goto release; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); goto release; } ret = check_stable_address_space(vma->vm_mm); if (ret) goto release; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); folio_put(folio); return handle_userfault(vmf, VM_UFFD_MISSING); } folio_ref_add(folio, nr_pages - 1); add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); setpte: if (vmf_orig_pte_uffd_wp(vmf)) entry = pte_mkuffd_wp(entry); set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages); /* No need to invalidate - it was non-present before */ update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: folio_put(folio); goto unlock; oom: return VM_FAULT_OOM; } /* * The mmap_lock must have been held on entry, and may have been * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). */ static vm_fault_t __do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; vm_fault_t ret; /* * Preallocate pte before we take page_lock because this might lead to * deadlocks for memcg reclaim which waits for pages under writeback: * lock_page(A) * SetPageWriteback(A) * unlock_page(A) * lock_page(B) * lock_page(B) * pte_alloc_one * shrink_folio_list * wait_on_page_writeback(A) * SetPageWriteback(B) * unlock_page(B) * # flush A, B to clear the writeback */ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; } ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | VM_FAULT_DONE_COW))) return ret; folio = page_folio(vmf->page); if (unlikely(PageHWPoison(vmf->page))) { vm_fault_t poisonret = VM_FAULT_HWPOISON; if (ret & VM_FAULT_LOCKED) { if (page_mapped(vmf->page)) unmap_mapping_folio(folio); /* Retry if a clean folio was removed from the cache. */ if (mapping_evict_folio(folio->mapping, folio)) poisonret = VM_FAULT_NOPAGE; folio_unlock(folio); } folio_put(folio); vmf->page = NULL; return poisonret; } if (unlikely(!(ret & VM_FAULT_LOCKED))) folio_lock(folio); else VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page); return ret; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void deposit_prealloc_pte(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); /* * We are going to consume the prealloc table, * count that as nr_ptes. */ mm_inc_nr_ptes(vma->vm_mm); vmf->prealloc_pte = NULL; } vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { struct folio *folio = page_folio(page); struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; vm_fault_t ret = VM_FAULT_FALLBACK; /* * It is too late to allocate a small folio, we already have a large * folio in the pagecache: especially s390 KVM cannot tolerate any * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any * PMD mappings if THPs are disabled. */ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) return ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return ret; if (folio_order(folio) != HPAGE_PMD_ORDER) return ret; page = &folio->page; /* * Just backoff if any subpage of a THP is corrupted otherwise * the corrupted page may mapped by PMD silently to escape the * check. This kind of THP just can be PTE mapped. Access to * the corrupted subpage should trigger SIGBUS as expected. */ if (unlikely(folio_test_has_hwpoisoned(folio))) return ret; /* * Archs like ppc64 need additional space to store information * related to pte entry. Use the preallocated table for that. */ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; } vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) goto out; flush_icache_pages(vma, page, HPAGE_PMD_NR); entry = mk_huge_pmd(page, vma->vm_page_prot); if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR); folio_add_file_rmap_pmd(folio, page, vma); /* * deposit and withdraw with pmd lock held */ if (arch_needs_pgtable_deposit()) deposit_prealloc_pte(vmf); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); update_mmu_cache_pmd(vma, haddr, vmf->pmd); /* fault is handled */ ret = 0; count_vm_event(THP_FILE_MAPPED); out: spin_unlock(vmf->ptl); return ret; } #else vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { return VM_FAULT_FALLBACK; } #endif /** * set_pte_range - Set a range of PTEs to point to pages in a folio. * @vmf: Fault decription. * @folio: The folio that contains @page. * @page: The first page to create a PTE for. * @nr: The number of PTEs to create. * @addr: The first address to create a PTE for. */ void set_pte_range(struct vm_fault *vmf, struct folio *folio, struct page *page, unsigned int nr, unsigned long addr) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE); pte_t entry; flush_icache_pages(vma, page, nr); entry = mk_pte(page, vma->vm_page_prot); if (prefault && arch_wants_old_prefaulted_pte()) entry = pte_mkold(entry); else entry = pte_sw_mkyoung(entry); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (unlikely(vmf_orig_pte_uffd_wp(vmf))) entry = pte_mkuffd_wp(entry); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { VM_BUG_ON_FOLIO(nr != 1, folio); folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } else { folio_add_file_rmap_ptes(folio, page, nr, vma); } set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr); } static bool vmf_pte_changed(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID) return !pte_same(ptep_get(vmf->pte), vmf->orig_pte); return !pte_none(ptep_get(vmf->pte)); } /** * finish_fault - finish page fault once we have prepared the page to fault * * @vmf: structure describing the fault * * This function handles all that is needed to finish a page fault once the * page to fault in is prepared. It handles locking of PTEs, inserts PTE for * given page, adds reverse page mapping, handles memcg charges and LRU * addition. * * The function expects the page to be locked and on success it consumes a * reference of a page being mapped (for the PTE which maps it). * * Return: %0 on success, %VM_FAULT_ code in case of error. */ vm_fault_t finish_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page; struct folio *folio; vm_fault_t ret; bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED); int type, nr_pages; unsigned long addr = vmf->address; /* Did we COW the page? */ if (is_cow) page = vmf->cow_page; else page = vmf->page; /* * check even for read faults because we might have lost our CoWed * page */ if (!(vma->vm_flags & VM_SHARED)) { ret = check_stable_address_space(vma->vm_mm); if (ret) return ret; } if (pmd_none(*vmf->pmd)) { if (PageTransCompound(page)) { ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) return ret; } if (vmf->prealloc_pte) pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte); else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) return VM_FAULT_OOM; } folio = page_folio(page); nr_pages = folio_nr_pages(folio); /* * Using per-page fault to maintain the uffd semantics, and same * approach also applies to non-anonymous-shmem faults to avoid * inflating the RSS of the process. */ if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) { nr_pages = 1; } else if (nr_pages > 1) { pgoff_t idx = folio_page_idx(folio, page); /* The page offset of vmf->address within the VMA. */ pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff; /* The index of the entry in the pagetable for fault page. */ pgoff_t pte_off = pte_index(vmf->address); /* * Fallback to per-page fault in case the folio size in page * cache beyond the VMA limits and PMD pagetable limits. */ if (unlikely(vma_off < idx || vma_off + (nr_pages - idx) > vma_pages(vma) || pte_off < idx || pte_off + (nr_pages - idx) > PTRS_PER_PTE)) { nr_pages = 1; } else { /* Now we can set mappings for the whole large folio. */ addr = vmf->address - idx * PAGE_SIZE; page = &folio->page; } } vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) return VM_FAULT_NOPAGE; /* Re-check under ptl */ if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) { update_mmu_tlb(vma, addr, vmf->pte); ret = VM_FAULT_NOPAGE; goto unlock; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); ret = VM_FAULT_NOPAGE; goto unlock; } folio_ref_add(folio, nr_pages - 1); set_pte_range(vmf, folio, page, nr_pages, addr); type = is_cow ? MM_ANONPAGES : mm_counter_file(folio); add_mm_counter(vma->vm_mm, type, nr_pages); ret = 0; unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; } static unsigned long fault_around_pages __read_mostly = 65536 >> PAGE_SHIFT; #ifdef CONFIG_DEBUG_FS static int fault_around_bytes_get(void *data, u64 *val) { *val = fault_around_pages << PAGE_SHIFT; return 0; } /* * fault_around_bytes must be rounded down to the nearest page order as it's * what do_fault_around() expects to see. */ static int fault_around_bytes_set(void *data, u64 val) { if (val / PAGE_SIZE > PTRS_PER_PTE) return -EINVAL; /* * The minimum value is 1 page, however this results in no fault-around * at all. See should_fault_around(). */ val = max(val, PAGE_SIZE); fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops, fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); static int __init fault_around_debugfs(void) { debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, &fault_around_bytes_fops); return 0; } late_initcall(fault_around_debugfs); #endif /* * do_fault_around() tries to map few pages around the fault address. The hope * is that the pages will be needed soon and this will lower the number of * faults to handle. * * It uses vm_ops->map_pages() to map the pages, which skips the page if it's * not ready to be mapped: not up-to-date, locked, etc. * * This function doesn't cross VMA or page table boundaries, in order to call * map_pages() and acquire a PTE lock only once. * * fault_around_pages defines how many pages we'll try to map. * do_fault_around() expects it to be set to a power of two less than or equal * to PTRS_PER_PTE. * * The virtual address of the area that we map is naturally aligned to * fault_around_pages * PAGE_SIZE rounded down to the machine page size * (and therefore to page order). This way it's easier to guarantee * that we don't cross page table boundaries. */ static vm_fault_t do_fault_around(struct vm_fault *vmf) { pgoff_t nr_pages = READ_ONCE(fault_around_pages); pgoff_t pte_off = pte_index(vmf->address); /* The page offset of vmf->address within the VMA. */ pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff; pgoff_t from_pte, to_pte; vm_fault_t ret; /* The PTE offset of the start address, clamped to the VMA. */ from_pte = max(ALIGN_DOWN(pte_off, nr_pages), pte_off - min(pte_off, vma_off)); /* The PTE offset of the end address, clamped to the VMA and PTE. */ to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE, pte_off + vma_pages(vmf->vma) - vma_off) - 1; if (pmd_none(*vmf->pmd)) { vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; } rcu_read_lock(); ret = vmf->vma->vm_ops->map_pages(vmf, vmf->pgoff + from_pte - pte_off, vmf->pgoff + to_pte - pte_off); rcu_read_unlock(); return ret; } /* Return true if we should do read fault-around, false otherwise */ static inline bool should_fault_around(struct vm_fault *vmf) { /* No ->map_pages? No way to fault around... */ if (!vmf->vma->vm_ops->map_pages) return false; if (uffd_disable_fault_around(vmf->vma)) return false; /* A single page implies no faulting 'around' at all. */ return fault_around_pages > 1; } static vm_fault_t do_read_fault(struct vm_fault *vmf) { vm_fault_t ret = 0; struct folio *folio; /* * Let's call ->map_pages() first and use ->fault() as fallback * if page by the offset is not ready to be mapped (cold cache or * something). */ if (should_fault_around(vmf)) { ret = do_fault_around(vmf); if (ret) return ret; } ret = vmf_can_call_fault(vmf); if (ret) return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; ret |= finish_fault(vmf); folio = page_folio(vmf->page); folio_unlock(folio); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) folio_put(folio); return ret; } static vm_fault_t do_cow_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; vm_fault_t ret; ret = vmf_can_call_fault(vmf); if (!ret) ret = vmf_anon_prepare(vmf); if (ret) return ret; folio = folio_prealloc(vma->vm_mm, vma, vmf->address, false); if (!folio) return VM_FAULT_OOM; vmf->cow_page = &folio->page; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; if (ret & VM_FAULT_DONE_COW) return ret; if (copy_mc_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma)) { ret = VM_FAULT_HWPOISON; goto unlock; } __folio_mark_uptodate(folio); ret |= finish_fault(vmf); unlock: unlock_page(vmf->page); put_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; return ret; uncharge_out: folio_put(folio); return ret; } static vm_fault_t do_shared_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret, tmp; struct folio *folio; ret = vmf_can_call_fault(vmf); if (ret) return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; folio = page_folio(vmf->page); /* * Check if the backing address space wants to know that the page is * about to become writable */ if (vma->vm_ops->page_mkwrite) { folio_unlock(folio); tmp = do_page_mkwrite(vmf, folio); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { folio_put(folio); return tmp; } } ret |= finish_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { folio_unlock(folio); folio_put(folio); return ret; } ret |= fault_dirty_shared_page(vmf); return ret; } /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults). * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). * If mmap_lock is released, vma may become invalid (for example * by other thread calling munmap()). */ static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *vm_mm = vma->vm_mm; vm_fault_t ret; /* * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) { vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte)) ret = VM_FAULT_SIGBUS; else { /* * Make sure this is not a temporary clearing of pte * by holding ptl and checking again. A R/M/W update * of pte involves: take ptl, clearing the pte so that * we don't have concurrent modification by hardware * followed by an update. */ if (unlikely(pte_none(ptep_get(vmf->pte)))) ret = VM_FAULT_SIGBUS; else ret = VM_FAULT_NOPAGE; pte_unmap_unlock(vmf->pte, vmf->ptl); } } else if (!(vmf->flags & FAULT_FLAG_WRITE)) ret = do_read_fault(vmf); else if (!(vma->vm_flags & VM_SHARED)) ret = do_cow_fault(vmf); else ret = do_shared_fault(vmf); /* preallocated pagetable is unused: free it */ if (vmf->prealloc_pte) { pte_free(vm_mm, vmf->prealloc_pte); vmf->prealloc_pte = NULL; } return ret; } int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, unsigned long addr, int *flags, bool writable, int *last_cpupid) { struct vm_area_struct *vma = vmf->vma; /* * Avoid grouping on RO pages in general. RO pages shouldn't hurt as * much anyway since they can be in shared cache state. This misses * the case where a mapping is writable but the process never writes * to it but pte_write gets cleared during protection updates and * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. */ if (!writable) *flags |= TNF_NO_GROUP; /* * Flag if the folio is shared between multiple address spaces. This * is later used when determining whether to group tasks together */ if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) *flags |= TNF_SHARED; /* * For memory tiering mode, cpupid of slow memory page is used * to record page access time. So use default value. */ if (folio_use_access_time(folio)) *last_cpupid = (-1 & LAST_CPUPID_MASK); else *last_cpupid = folio_last_cpupid(folio); /* Record the current PID acceesing VMA */ vma_set_access_pid_bit(vma); count_vm_numa_event(NUMA_HINT_FAULTS); #ifdef CONFIG_NUMA_BALANCING count_memcg_folio_events(folio, NUMA_HINT_FAULTS, 1); #endif if (folio_nid(folio) == numa_node_id()) { count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); *flags |= TNF_FAULT_LOCAL; } return mpol_misplaced(folio, vmf, addr); } static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma, unsigned long fault_addr, pte_t *fault_pte, bool writable) { pte_t pte, old_pte; old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte); pte = pte_modify(old_pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (writable) pte = pte_mkwrite(pte, vma); ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, pte); update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1); } static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma, struct folio *folio, pte_t fault_pte, bool ignore_writable, bool pte_write_upgrade) { int nr = pte_pfn(fault_pte) - folio_pfn(folio); unsigned long start, end, addr = vmf->address; unsigned long addr_start = addr - (nr << PAGE_SHIFT); unsigned long pt_start = ALIGN_DOWN(addr, PMD_SIZE); pte_t *start_ptep; /* Stay within the VMA and within the page table. */ start = max3(addr_start, pt_start, vma->vm_start); end = min3(addr_start + folio_size(folio), pt_start + PMD_SIZE, vma->vm_end); start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT); /* Restore all PTEs' mapping of the large folio */ for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) { pte_t ptent = ptep_get(start_ptep); bool writable = false; if (!pte_present(ptent) || !pte_protnone(ptent)) continue; if (pfn_folio(pte_pfn(ptent)) != folio) continue; if (!ignore_writable) { ptent = pte_modify(ptent, vma->vm_page_prot); writable = pte_write(ptent); if (!writable && pte_write_upgrade && can_change_pte_writable(vma, addr, ptent)) writable = true; } numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable); } } static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; int nid = NUMA_NO_NODE; bool writable = false, ignore_writable = false; bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma); int last_cpupid; int target_nid; pte_t pte, old_pte; int flags = 0, nr_pages; /* * The pte cannot be used safely until we verify, while holding the page * table lock, that its contents have not changed during fault handling. */ spin_lock(vmf->ptl); /* Read the live PTE from the page tables: */ old_pte = ptep_get(vmf->pte); if (unlikely(!pte_same(old_pte, vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } pte = pte_modify(old_pte, vma->vm_page_prot); /* * Detect now whether the PTE could be writable; this information * is only valid while holding the PT lock. */ writable = pte_write(pte); if (!writable && pte_write_upgrade && can_change_pte_writable(vma, vmf->address, pte)) writable = true; folio = vm_normal_folio(vma, vmf->address, pte); if (!folio || folio_is_zone_device(folio)) goto out_map; nid = folio_nid(folio); nr_pages = folio_nr_pages(folio); target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags, writable, &last_cpupid); if (target_nid == NUMA_NO_NODE) goto out_map; if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) { flags |= TNF_MIGRATE_FAIL; goto out_map; } /* The folio is isolated and isolation code holds a folio reference. */ pte_unmap_unlock(vmf->pte, vmf->ptl); writable = false; ignore_writable = true; /* Migrate to the requested node */ if (!migrate_misplaced_folio(folio, vma, target_nid)) { nid = target_nid; flags |= TNF_MIGRATED; task_numa_fault(last_cpupid, nid, nr_pages, flags); return 0; } flags |= TNF_MIGRATE_FAIL; vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte)) return 0; if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } out_map: /* * Make it present again, depending on how arch implements * non-accessible ptes, some can allow access by kernel mode. */ if (folio && folio_test_large(folio)) numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable, pte_write_upgrade); else numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte, writable); pte_unmap_unlock(vmf->pte, vmf->ptl); if (nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, nid, nr_pages, flags); return 0; } static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PMD_ORDER); return VM_FAULT_FALLBACK; } /* `inline' is required to avoid gcc 4.1.2 build error */ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; vm_fault_t ret; if (vma_is_anonymous(vma)) { if (likely(!unshare) && userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) { if (userfaultfd_wp_async(vmf->vma)) goto split; return handle_userfault(vmf, VM_UFFD_WP); } return do_huge_pmd_wp_page(vmf); } if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) return ret; } } split: /* COW or write-notify handled on pte level: split pmd. */ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } static vm_fault_t create_huge_pud(struct vm_fault *vmf) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) struct vm_area_struct *vma = vmf->vma; /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vma)) return VM_FAULT_FALLBACK; if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PUD_ORDER); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vma)) goto split; if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) return ret; } } split: /* COW or write-notify not handled on PUD level: split pud.*/ __split_huge_pud(vma, vmf->pud, vmf->address); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ return VM_FAULT_FALLBACK; } /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most * RISC architectures). The early dirtying is also good on the i386. * * There is also a hook called "update_mmu_cache()" that architectures * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). * * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow * concurrent faults). * * The mmap_lock may have been released depending on flags and our return value. * See filemap_fault() and __folio_lock_or_retry(). */ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { pte_t entry; if (unlikely(pmd_none(*vmf->pmd))) { /* * Leave __pte_alloc() until later: because vm_ops->fault may * want to allocate huge page, and if we expose page table * for an instant, it will be difficult to retract from * concurrent faults and from rmap lookups. */ vmf->pte = NULL; vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID; } else { pmd_t dummy_pmdval; /* * A regular pmd is established and it can't morph into a huge * pmd by anon khugepaged, since that takes mmap_lock in write * mode; but shmem or file collapse to THP could still morph * it into a huge pmd: just retry later if so. * * Use the maywrite version to indicate that vmf->pte may be * modified, but since we will use pte_same() to detect the * change of the !pte_none() entry, there is no need to recheck * the pmdval. Here we chooes to pass a dummy variable instead * of NULL, which helps new user think about why this place is * special. */ vmf->pte = pte_offset_map_rw_nolock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &dummy_pmdval, &vmf->ptl); if (unlikely(!vmf->pte)) return 0; vmf->orig_pte = ptep_get_lockless(vmf->pte); vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID; if (pte_none(vmf->orig_pte)) { pte_unmap(vmf->pte); vmf->pte = NULL; } } if (!vmf->pte) return do_pte_missing(vmf); if (!pte_present(vmf->orig_pte)) return do_swap_page(vmf); if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) return do_numa_page(vmf); spin_lock(vmf->ptl); entry = vmf->orig_pte; if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); goto unlock; } if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!pte_write(entry)) return do_wp_page(vmf); else if (likely(vmf->flags & FAULT_FLAG_WRITE)) entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, vmf->flags & FAULT_FLAG_WRITE)) { update_mmu_cache_range(vmf, vmf->vma, vmf->address, vmf->pte, 1); } else { /* Skip spurious TLB flush for retried page fault */ if (vmf->flags & FAULT_FLAG_TRIED) goto unlock; /* * This is needed only for protection faults but the arch code * is not yet telling us if this is a protection fault or not. * This still avoids useless tlb flushes for .text page faults * with threads. */ if (vmf->flags & FAULT_FLAG_WRITE) flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, vmf->pte); } unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } /* * On entry, we hold either the VMA lock or the mmap_lock * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in * the result, the mmap_lock is not held on exit. See filemap_fault() * and __folio_lock_or_retry(). */ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { struct vm_fault vmf = { .vma = vma, .address = address & PAGE_MASK, .real_address = address, .flags = flags, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; unsigned long vm_flags = vma->vm_flags; pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; pgd = pgd_offset(mm, address); p4d = p4d_alloc(mm, pgd, address); if (!p4d) return VM_FAULT_OOM; vmf.pud = pud_alloc(mm, p4d, address); if (!vmf.pud) return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && thp_vma_allowable_order(vma, vm_flags, TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { pud_t orig_pud = *vmf.pud; barrier(); if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { /* * TODO once we support anonymous PUDs: NUMA case and * FAULT_FLAG_UNSHARE handling. */ if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) { ret = wp_huge_pud(&vmf, orig_pud); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { huge_pud_set_accessed(&vmf, orig_pud); return 0; } } } vmf.pmd = pmd_alloc(mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; /* Huge pud page fault raced with pmd_alloc? */ if (pud_trans_unstable(vmf.pud)) goto retry_pud; if (pmd_none(*vmf.pmd) && thp_vma_allowable_order(vma, vm_flags, TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); if (unlikely(is_swap_pmd(vmf.orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(vmf.orig_pmd)); if (is_pmd_migration_entry(vmf.orig_pmd)) pmd_migration_entry_wait(mm, vmf.pmd); return 0; } if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) { if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf); if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && !pmd_write(vmf.orig_pmd)) { ret = wp_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { huge_pmd_set_accessed(&vmf); return 0; } } } return handle_pte_fault(&vmf); } /** * mm_account_fault - Do page fault accounting * @mm: mm from which memcg should be extracted. It can be NULL. * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting * of perf event counters, but we'll still do the per-task accounting to * the task who triggered this page fault. * @address: the faulted address. * @flags: the fault flags. * @ret: the fault retcode. * * This will take care of most of the page fault accounting. Meanwhile, it * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter * updates. However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should * still be in per-arch page fault handlers at the entry of page fault. */ static inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs, unsigned long address, unsigned int flags, vm_fault_t ret) { bool major; /* Incomplete faults will be accounted upon completion. */ if (ret & VM_FAULT_RETRY) return; /* * To preserve the behavior of older kernels, PGFAULT counters record * both successful and failed faults, as opposed to perf counters, * which ignore failed cases. */ count_vm_event(PGFAULT); count_memcg_event_mm(mm, PGFAULT); /* * Do not account for unsuccessful faults (e.g. when the address wasn't * valid). That includes arch_vma_access_permitted() failing before * reaching here. So this is not a "this many hardware page faults" * counter. We should use the hw profiling for that. */ if (ret & VM_FAULT_ERROR) return; /* * We define the fault as a major fault when the final successful fault * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't * handle it immediately previously). */ major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED); if (major) current->maj_flt++; else current->min_flt++; /* * If the fault is done for GUP, regs will be NULL. We only do the * accounting for the per thread fault counters who triggered the * fault, and we skip the perf event updates. */ if (!regs) return; if (major) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); else perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } #ifdef CONFIG_LRU_GEN static void lru_gen_enter_fault(struct vm_area_struct *vma) { /* the LRU algorithm only applies to accesses with recency */ current->in_lru_fault = vma_has_recency(vma); } static void lru_gen_exit_fault(void) { current->in_lru_fault = false; } #else static void lru_gen_enter_fault(struct vm_area_struct *vma) { } static void lru_gen_exit_fault(void) { } #endif /* CONFIG_LRU_GEN */ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, unsigned int *flags) { if (unlikely(*flags & FAULT_FLAG_UNSHARE)) { if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE)) return VM_FAULT_SIGSEGV; /* * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's * just treat it like an ordinary read-fault otherwise. */ if (!is_cow_mapping(vma->vm_flags)) *flags &= ~FAULT_FLAG_UNSHARE; } else if (*flags & FAULT_FLAG_WRITE) { /* Write faults on read-only mappings are impossible ... */ if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE))) return VM_FAULT_SIGSEGV; /* ... and FOLL_FORCE only applies to COW mappings. */ if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) && !is_cow_mapping(vma->vm_flags))) return VM_FAULT_SIGSEGV; } #ifdef CONFIG_PER_VMA_LOCK /* * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of * the assumption that lock is dropped on VM_FAULT_RETRY. */ if (WARN_ON_ONCE((*flags & (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) == (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT))) return VM_FAULT_SIGSEGV; #endif return 0; } /* * By the time we get here, we already hold the mm semaphore * * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). */ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) { /* If the fault handler drops the mmap_lock, vma may be freed */ struct mm_struct *mm = vma->vm_mm; vm_fault_t ret; bool is_droppable; __set_current_state(TASK_RUNNING); ret = sanitize_fault_flags(vma, &flags); if (ret) goto out; if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, flags & FAULT_FLAG_INSTRUCTION, flags & FAULT_FLAG_REMOTE)) { ret = VM_FAULT_SIGSEGV; goto out; } is_droppable = !!(vma->vm_flags & VM_DROPPABLE); /* * Enable the memcg OOM handling for faults triggered in user * space. Kernel faults are handled more gracefully. */ if (flags & FAULT_FLAG_USER) mem_cgroup_enter_user_fault(); lru_gen_enter_fault(vma); if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); else ret = __handle_mm_fault(vma, address, flags); /* * Warning: It is no longer safe to dereference vma-> after this point, * because mmap_lock might have been dropped by __handle_mm_fault(), so * vma might be destroyed from underneath us. */ lru_gen_exit_fault(); /* If the mapping is droppable, then errors due to OOM aren't fatal. */ if (is_droppable) ret &= ~VM_FAULT_OOM; if (flags & FAULT_FLAG_USER) { mem_cgroup_exit_user_fault(); /* * The task may have entered a memcg OOM situation but * if the allocation error was handled gracefully (no * VM_FAULT_OOM), there is no need to kill anything. * Just clean up the OOM state peacefully. */ if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) mem_cgroup_oom_synchronize(false); } out: mm_account_fault(mm, regs, address, flags, ret); return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); #ifdef CONFIG_LOCK_MM_AND_FIND_VMA #include <linux/extable.h> static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) { if (likely(mmap_read_trylock(mm))) return true; if (regs && !user_mode(regs)) { unsigned long ip = exception_ip(regs); if (!search_exception_tables(ip)) return false; } return !mmap_read_lock_killable(mm); } static inline bool mmap_upgrade_trylock(struct mm_struct *mm) { /* * We don't have this operation yet. * * It should be easy enough to do: it's basically a * atomic_long_try_cmpxchg_acquire() * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but * it also needs the proper lockdep magic etc. */ return false; } static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) { mmap_read_unlock(mm); if (regs && !user_mode(regs)) { unsigned long ip = exception_ip(regs); if (!search_exception_tables(ip)) return false; } return !mmap_write_lock_killable(mm); } /* * Helper for page fault handling. * * This is kind of equivalend to "mmap_read_lock()" followed * by "find_extend_vma()", except it's a lot more careful about * the locking (and will drop the lock on failure). * * For example, if we have a kernel bug that causes a page * fault, we don't want to just use mmap_read_lock() to get * the mm lock, because that would deadlock if the bug were * to happen while we're holding the mm lock for writing. * * So this checks the exception tables on kernel faults in * order to only do this all for instructions that are actually * expected to fault. * * We can also actually take the mm lock for writing if we * need to extend the vma, which helps the VM layer a lot. */ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long addr, struct pt_regs *regs) { struct vm_area_struct *vma; if (!get_mmap_lock_carefully(mm, regs)) return NULL; vma = find_vma(mm, addr); if (likely(vma && (vma->vm_start <= addr))) return vma; /* * Well, dang. We might still be successful, but only * if we can extend a vma to do so. */ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { mmap_read_unlock(mm); return NULL; } /* * We can try to upgrade the mmap lock atomically, * in which case we can continue to use the vma * we already looked up. * * Otherwise we'll have to drop the mmap lock and * re-take it, and also look up the vma again, * re-checking it. */ if (!mmap_upgrade_trylock(mm)) { if (!upgrade_mmap_lock_carefully(mm, regs)) return NULL; vma = find_vma(mm, addr); if (!vma) goto fail; if (vma->vm_start <= addr) goto success; if (!(vma->vm_flags & VM_GROWSDOWN)) goto fail; } if (expand_stack_locked(vma, addr)) goto fail; success: mmap_write_downgrade(mm); return vma; fail: mmap_write_unlock(mm); return NULL; } #endif #ifdef CONFIG_PER_VMA_LOCK /* * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be * stable and not isolated. If the VMA is not found or is being modified the * function returns NULL. */ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) { MA_STATE(mas, &mm->mm_mt, address, address); struct vm_area_struct *vma; rcu_read_lock(); retry: vma = mas_walk(&mas); if (!vma) goto inval; if (!vma_start_read(vma)) goto inval; /* Check if the VMA got isolated after we found it */ if (vma->detached) { vma_end_read(vma); count_vm_vma_lock_event(VMA_LOCK_MISS); /* The area was replaced with another one */ goto retry; } /* * At this point, we have a stable reference to a VMA: The VMA is * locked and we know it hasn't already been isolated. * From here on, we can access the VMA without worrying about which * fields are accessible for RCU readers. */ /* Check since vm_start/vm_end might change before we lock the VMA */ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto inval_end_read; rcu_read_unlock(); return vma; inval_end_read: vma_end_read(vma); inval: rcu_read_unlock(); count_vm_vma_lock_event(VMA_LOCK_ABORT); return NULL; } #endif /* CONFIG_PER_VMA_LOCK */ #ifndef __PAGETABLE_P4D_FOLDED /* * Allocate p4d page table. * We've already handled the fast-path in-line. */ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { p4d_t *new = p4d_alloc_one(mm, address); if (!new) return -ENOMEM; spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) { /* Another has populated it */ p4d_free(mm, new); } else { smp_wmb(); /* See comment in pmd_install() */ pgd_populate(mm, pgd, new); } spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_P4D_FOLDED */ #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. * We've already handled the fast-path in-line. */ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { pud_t *new = pud_alloc_one(mm, address); if (!new) return -ENOMEM; spin_lock(&mm->page_table_lock); if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); smp_wmb(); /* See comment in pmd_install() */ p4d_populate(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_PMD_FOLDED /* * Allocate page middle directory. * We've already handled the fast-path in-line. */ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { spinlock_t *ptl; pmd_t *new = pmd_alloc_one(mm, address); if (!new) return -ENOMEM; ptl = pud_lock(mm, pud); if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); smp_wmb(); /* See comment in pmd_install() */ pud_populate(mm, pud, new); } else { /* Another has populated it */ pmd_free(mm, new); } spin_unlock(ptl); return 0; } #endif /* __PAGETABLE_PMD_FOLDED */ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args, spinlock_t *lock, pte_t *ptep, pgprot_t pgprot, unsigned long pfn_base, unsigned long addr_mask, bool writable, bool special) { args->lock = lock; args->ptep = ptep; args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); args->pgprot = pgprot; args->writable = writable; args->special = special; } static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma) { #ifdef CONFIG_LOCKDEP struct file *file = vma->vm_file; struct address_space *mapping = file ? file->f_mapping : NULL; if (mapping) lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) || lockdep_is_held(&vma->vm_mm->mmap_lock)); else lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock)); #endif } /** * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address * @args: Pointer to struct @follow_pfnmap_args * * The caller needs to setup args->vma and args->address to point to the * virtual address as the target of such lookup. On a successful return, * the results will be put into other output fields. * * After the caller finished using the fields, the caller must invoke * another follow_pfnmap_end() to proper releases the locks and resources * of such look up request. * * During the start() and end() calls, the results in @args will be valid * as proper locks will be held. After the end() is called, all the fields * in @follow_pfnmap_args will be invalid to be further accessed. Further * use of such information after end() may require proper synchronizations * by the caller with page table updates, otherwise it can create a * security bug. * * If the PTE maps a refcounted page, callers are responsible to protect * against invalidation with MMU notifiers; otherwise access to the PFN at * a later point in time can trigger use-after-free. * * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore * should be taken for read, and the mmap semaphore cannot be released * before the end() is invoked. * * This function must not be used to modify PTE content. * * Return: zero on success, negative otherwise. */ int follow_pfnmap_start(struct follow_pfnmap_args *args) { struct vm_area_struct *vma = args->vma; unsigned long address = args->address; struct mm_struct *mm = vma->vm_mm; spinlock_t *lock; pgd_t *pgdp; p4d_t *p4dp, p4d; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; pfnmap_lockdep_assert(vma); if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto out; if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; retry: pgdp = pgd_offset(mm, address); if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp))) goto out; p4dp = p4d_offset(pgdp, address); p4d = READ_ONCE(*p4dp); if (p4d_none(p4d) || unlikely(p4d_bad(p4d))) goto out; pudp = pud_offset(p4dp, address); pud = READ_ONCE(*pudp); if (pud_none(pud)) goto out; if (pud_leaf(pud)) { lock = pud_lock(mm, pudp); if (!unlikely(pud_leaf(pud))) { spin_unlock(lock); goto retry; } pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud), pud_pfn(pud), PUD_MASK, pud_write(pud), pud_special(pud)); return 0; } pmdp = pmd_offset(pudp, address); pmd = pmdp_get_lockless(pmdp); if (pmd_leaf(pmd)) { lock = pmd_lock(mm, pmdp); if (!unlikely(pmd_leaf(pmd))) { spin_unlock(lock); goto retry; } pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd), pmd_pfn(pmd), PMD_MASK, pmd_write(pmd), pmd_special(pmd)); return 0; } ptep = pte_offset_map_lock(mm, pmdp, address, &lock); if (!ptep) goto out; pte = ptep_get(ptep); if (!pte_present(pte)) goto unlock; pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte), pte_pfn(pte), PAGE_MASK, pte_write(pte), pte_special(pte)); return 0; unlock: pte_unmap_unlock(ptep, lock); out: return -EINVAL; } EXPORT_SYMBOL_GPL(follow_pfnmap_start); /** * follow_pfnmap_end(): End a follow_pfnmap_start() process * @args: Pointer to struct @follow_pfnmap_args * * Must be used in pair of follow_pfnmap_start(). See the start() function * above for more information. */ void follow_pfnmap_end(struct follow_pfnmap_args *args) { if (args->lock) spin_unlock(args->lock); if (args->ptep) pte_unmap(args->ptep); } EXPORT_SYMBOL_GPL(follow_pfnmap_end); #ifdef CONFIG_HAVE_IOREMAP_PROT /** * generic_access_phys - generic implementation for iomem mmap access * @vma: the vma to access * @addr: userspace address, not relative offset within @vma * @buf: buffer to read/write * @len: length of transfer * @write: set to FOLL_WRITE when writing, otherwise reading * * This is a generic implementation for &vm_operations_struct.access for an * iomem mapping. This callback is used by access_process_vm() when the @vma is * not page based. */ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { resource_size_t phys_addr; unsigned long prot = 0; void __iomem *maddr; int offset = offset_in_page(addr); int ret = -EINVAL; bool writable; struct follow_pfnmap_args args = { .vma = vma, .address = addr }; retry: if (follow_pfnmap_start(&args)) return -EINVAL; prot = pgprot_val(args.pgprot); phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT; writable = args.writable; follow_pfnmap_end(&args); if ((write & FOLL_WRITE) && !writable) return -EINVAL; maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); if (!maddr) return -ENOMEM; if (follow_pfnmap_start(&args)) goto out_unmap; if ((prot != pgprot_val(args.pgprot)) || (phys_addr != (args.pfn << PAGE_SHIFT)) || (writable != args.writable)) { follow_pfnmap_end(&args); iounmap(maddr); goto retry; } if (write) memcpy_toio(maddr + offset, buf, len); else memcpy_fromio(buf, maddr + offset, len); ret = len; follow_pfnmap_end(&args); out_unmap: iounmap(maddr); return ret; } EXPORT_SYMBOL_GPL(generic_access_phys); #endif /* * Access another process' address space as given in mm. */ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { void *old_buf = buf; int write = gup_flags & FOLL_WRITE; if (mmap_read_lock_killable(mm)) return 0; /* Untag the address before looking up the VMA */ addr = untagged_addr_remote(mm, addr); /* Avoid triggering the temporary warning in __get_user_pages */ if (!vma_lookup(mm, addr) && !expand_stack(mm, addr)) return 0; /* ignore errors, just check how much was successfully transferred */ while (len) { int bytes, offset; void *maddr; struct vm_area_struct *vma = NULL; struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); if (IS_ERR(page)) { /* We might need to expand the stack to access it */ vma = vma_lookup(mm, addr); if (!vma) { vma = expand_stack(mm, addr); /* mmap_lock was dropped on failure */ if (!vma) return buf - old_buf; /* Try again if stack expansion worked */ continue; } /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. */ bytes = 0; #ifdef CONFIG_HAVE_IOREMAP_PROT if (vma->vm_ops && vma->vm_ops->access) bytes = vma->vm_ops->access(vma, addr, buf, len, write); #endif if (bytes <= 0) break; } else { bytes = len; offset = addr & (PAGE_SIZE-1); if (bytes > PAGE_SIZE-offset) bytes = PAGE_SIZE-offset; maddr = kmap_local_page(page); if (write) { copy_to_user_page(vma, page, addr, maddr + offset, buf, bytes); set_page_dirty_lock(page); } else { copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes); } unmap_and_put_page(page, maddr); } len -= bytes; buf += bytes; addr += bytes; } mmap_read_unlock(mm); return buf - old_buf; } /** * access_remote_vm - access another process' address space * @mm: the mm_struct of the target address space * @addr: start address to access * @buf: source or destination buffer * @len: number of bytes to transfer * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. * * Return: number of bytes copied from source to destination. */ int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { return __access_remote_vm(mm, addr, buf, len, gup_flags); } /* * Access another process' address space. * Source/target buffer must be kernel space, * Do not walk the page table directly, use get_user_pages */ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; int ret; mm = get_task_mm(tsk); if (!mm) return 0; ret = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return ret; } EXPORT_SYMBOL_GPL(access_process_vm); /* * Print the name of a VMA. */ void print_vma_addr(char *prefix, unsigned long ip) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; /* * we might be running from an atomic context so we cannot sleep */ if (!mmap_read_trylock(mm)) return; vma = vma_lookup(mm, ip); if (vma && vma->vm_file) { struct file *f = vma->vm_file; ip -= vma->vm_start; ip += vma->vm_pgoff << PAGE_SHIFT; printk("%s%pD[%lx,%lx+%lx]", prefix, f, ip, vma->vm_start, vma->vm_end - vma->vm_start); } mmap_read_unlock(mm); } #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) void __might_fault(const char *file, int line) { if (pagefault_disabled()) return; __might_sleep(file, line); #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) if (current->mm) might_lock_read(&current->mm->mmap_lock); #endif } EXPORT_SYMBOL(__might_fault); #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) /* * Process all subpages of the specified huge page with the specified * operation. The target subpage will be processed last to keep its * cache lines hot. */ static inline int process_huge_page( unsigned long addr_hint, unsigned int nr_pages, int (*process_subpage)(unsigned long addr, int idx, void *arg), void *arg) { int i, n, base, l, ret; unsigned long addr = addr_hint & ~(((unsigned long)nr_pages << PAGE_SHIFT) - 1); /* Process target subpage last to keep its cache lines hot */ might_sleep(); n = (addr_hint - addr) / PAGE_SIZE; if (2 * n <= nr_pages) { /* If target subpage in first half of huge page */ base = 0; l = n; /* Process subpages at the end of huge page */ for (i = nr_pages - 1; i >= 2 * n; i--) { cond_resched(); ret = process_subpage(addr + i * PAGE_SIZE, i, arg); if (ret) return ret; } } else { /* If target subpage in second half of huge page */ base = nr_pages - 2 * (nr_pages - n); l = nr_pages - n; /* Process subpages at the begin of huge page */ for (i = 0; i < base; i++) { cond_resched(); ret = process_subpage(addr + i * PAGE_SIZE, i, arg); if (ret) return ret; } } /* * Process remaining subpages in left-right-left-right pattern * towards the target subpage */ for (i = 0; i < l; i++) { int left_idx = base + i; int right_idx = base + 2 * l - 1 - i; cond_resched(); ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg); if (ret) return ret; cond_resched(); ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg); if (ret) return ret; } return 0; } static void clear_gigantic_page(struct folio *folio, unsigned long addr, unsigned int nr_pages) { int i; might_sleep(); for (i = 0; i < nr_pages; i++) { cond_resched(); clear_user_highpage(folio_page(folio, i), addr + i * PAGE_SIZE); } } static int clear_subpage(unsigned long addr, int idx, void *arg) { struct folio *folio = arg; clear_user_highpage(folio_page(folio, idx), addr); return 0; } /** * folio_zero_user - Zero a folio which will be mapped to userspace. * @folio: The folio to zero. * @addr_hint: The address will be accessed or the base address if uncelar. */ void folio_zero_user(struct folio *folio, unsigned long addr_hint) { unsigned int nr_pages = folio_nr_pages(folio); if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) clear_gigantic_page(folio, addr_hint, nr_pages); else process_huge_page(addr_hint, nr_pages, clear_subpage, folio); } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, unsigned long addr, struct vm_area_struct *vma, unsigned int nr_pages) { int i; struct page *dst_page; struct page *src_page; for (i = 0; i < nr_pages; i++) { dst_page = folio_page(dst, i); src_page = folio_page(src, i); cond_resched(); if (copy_mc_user_highpage(dst_page, src_page, addr + i*PAGE_SIZE, vma)) return -EHWPOISON; } return 0; } struct copy_subpage_arg { struct folio *dst; struct folio *src; struct vm_area_struct *vma; }; static int copy_subpage(unsigned long addr, int idx, void *arg) { struct copy_subpage_arg *copy_arg = arg; struct page *dst = folio_page(copy_arg->dst, idx); struct page *src = folio_page(copy_arg->src, idx); if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) return -EHWPOISON; return 0; } int copy_user_large_folio(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma) { unsigned int nr_pages = folio_nr_pages(dst); struct copy_subpage_arg arg = { .dst = dst, .src = src, .vma = vma, }; if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) return copy_user_gigantic_page(dst, src, addr_hint, vma, nr_pages); return process_huge_page(addr_hint, nr_pages, copy_subpage, &arg); } long copy_folio_from_user(struct folio *dst_folio, const void __user *usr_src, bool allow_pagefault) { void *kaddr; unsigned long i, rc = 0; unsigned int nr_pages = folio_nr_pages(dst_folio); unsigned long ret_val = nr_pages * PAGE_SIZE; struct page *subpage; for (i = 0; i < nr_pages; i++) { subpage = folio_page(dst_folio, i); kaddr = kmap_local_page(subpage); if (!allow_pagefault) pagefault_disable(); rc = copy_from_user(kaddr, usr_src + i * PAGE_SIZE, PAGE_SIZE); if (!allow_pagefault) pagefault_enable(); kunmap_local(kaddr); ret_val -= (PAGE_SIZE - rc); if (rc) break; flush_dcache_page(subpage); cond_resched(); } return ret_val; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS static struct kmem_cache *page_ptl_cachep; void __init ptlock_cache_init(void) { page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, SLAB_PANIC, NULL); } bool ptlock_alloc(struct ptdesc *ptdesc) { spinlock_t *ptl; ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); if (!ptl) return false; ptdesc->ptl = ptl; return true; } void ptlock_free(struct ptdesc *ptdesc) { kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif void vma_pgtable_walk_begin(struct vm_area_struct *vma) { if (is_vm_hugetlb_page(vma)) hugetlb_vma_lock_read(vma); } void vma_pgtable_walk_end(struct vm_area_struct *vma) { if (is_vm_hugetlb_page(vma)) hugetlb_vma_unlock_read(vma); }
46 45 46 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/stats.c * * procfs-based user access to generic RPC statistics. The stats files * reside in /proc/net/rpc. * * The read routines assume that the buffer passed in is just big enough. * If you implement an RPC service that has its own stats routine which * appends the generic RPC stats, make sure you don't exceed the PAGE_SIZE * limit. * * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/metrics.h> #include <linux/rcupdate.h> #include <trace/events/sunrpc.h> #include "netns.h" #define RPCDBG_FACILITY RPCDBG_MISC /* * Get RPC client stats */ static int rpc_proc_show(struct seq_file *seq, void *v) { const struct rpc_stat *statp = seq->private; const struct rpc_program *prog = statp->program; unsigned int i, j; seq_printf(seq, "net %u %u %u %u\n", statp->netcnt, statp->netudpcnt, statp->nettcpcnt, statp->nettcpconn); seq_printf(seq, "rpc %u %u %u\n", statp->rpccnt, statp->rpcretrans, statp->rpcauthrefresh); for (i = 0; i < prog->nrvers; i++) { const struct rpc_version *vers = prog->version[i]; if (!vers) continue; seq_printf(seq, "proc%u %u", vers->number, vers->nrprocs); for (j = 0; j < vers->nrprocs; j++) seq_printf(seq, " %u", vers->counts[j]); seq_putc(seq, '\n'); } return 0; } static int rpc_proc_open(struct inode *inode, struct file *file) { return single_open(file, rpc_proc_show, pde_data(inode)); } static const struct proc_ops rpc_proc_ops = { .proc_open = rpc_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, }; /* * Get RPC server stats */ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) { const struct svc_program *prog = statp->program; const struct svc_version *vers; unsigned int i, j, k; unsigned long count; seq_printf(seq, "net %u %u %u %u\n", statp->netcnt, statp->netudpcnt, statp->nettcpcnt, statp->nettcpconn); seq_printf(seq, "rpc %u %u %u %u %u\n", statp->rpccnt, statp->rpcbadfmt+statp->rpcbadauth+statp->rpcbadclnt, statp->rpcbadfmt, statp->rpcbadauth, statp->rpcbadclnt); for (i = 0; i < prog->pg_nvers; i++) { vers = prog->pg_vers[i]; if (!vers) continue; seq_printf(seq, "proc%d %u", i, vers->vs_nproc); for (j = 0; j < vers->vs_nproc; j++) { count = 0; for_each_possible_cpu(k) count += per_cpu(vers->vs_count[j], k); seq_printf(seq, " %lu", count); } seq_putc(seq, '\n'); } } EXPORT_SYMBOL_GPL(svc_seq_show); /** * rpc_alloc_iostats - allocate an rpc_iostats structure * @clnt: RPC program, version, and xprt * */ struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt) { struct rpc_iostats *stats; int i; stats = kcalloc(clnt->cl_maxproc, sizeof(*stats), GFP_KERNEL); if (stats) { for (i = 0; i < clnt->cl_maxproc; i++) spin_lock_init(&stats[i].om_lock); } return stats; } EXPORT_SYMBOL_GPL(rpc_alloc_iostats); /** * rpc_free_iostats - release an rpc_iostats structure * @stats: doomed rpc_iostats structure * */ void rpc_free_iostats(struct rpc_iostats *stats) { kfree(stats); } EXPORT_SYMBOL_GPL(rpc_free_iostats); /** * rpc_count_iostats_metrics - tally up per-task stats * @task: completed rpc_task * @op_metrics: stat structure for OP that will accumulate stats from @task */ void rpc_count_iostats_metrics(const struct rpc_task *task, struct rpc_iostats *op_metrics) { struct rpc_rqst *req = task->tk_rqstp; ktime_t backlog, execute, now; if (!op_metrics || !req) return; now = ktime_get(); spin_lock(&op_metrics->om_lock); op_metrics->om_ops++; /* kernel API: om_ops must never become larger than om_ntrans */ op_metrics->om_ntrans += max(req->rq_ntrans, 1); op_metrics->om_timeouts += task->tk_timeouts; op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent; op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd; backlog = 0; if (ktime_to_ns(req->rq_xtime)) { backlog = ktime_sub(req->rq_xtime, task->tk_start); op_metrics->om_queue = ktime_add(op_metrics->om_queue, backlog); } op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt); execute = ktime_sub(now, task->tk_start); op_metrics->om_execute = ktime_add(op_metrics->om_execute, execute); if (task->tk_status < 0) op_metrics->om_error_status++; spin_unlock(&op_metrics->om_lock); trace_rpc_stats_latency(req->rq_task, backlog, req->rq_rtt, execute); } EXPORT_SYMBOL_GPL(rpc_count_iostats_metrics); /** * rpc_count_iostats - tally up per-task stats * @task: completed rpc_task * @stats: array of stat structures * * Uses the statidx from @task */ void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats) { rpc_count_iostats_metrics(task, &stats[task->tk_msg.rpc_proc->p_statidx]); } EXPORT_SYMBOL_GPL(rpc_count_iostats); static void _print_name(struct seq_file *seq, unsigned int op, const struct rpc_procinfo *procs) { if (procs[op].p_name) seq_printf(seq, "\t%12s: ", procs[op].p_name); else if (op == 0) seq_printf(seq, "\t NULL: "); else seq_printf(seq, "\t%12u: ", op); } static void _add_rpc_iostats(struct rpc_iostats *a, struct rpc_iostats *b) { a->om_ops += b->om_ops; a->om_ntrans += b->om_ntrans; a->om_timeouts += b->om_timeouts; a->om_bytes_sent += b->om_bytes_sent; a->om_bytes_recv += b->om_bytes_recv; a->om_queue = ktime_add(a->om_queue, b->om_queue); a->om_rtt = ktime_add(a->om_rtt, b->om_rtt); a->om_execute = ktime_add(a->om_execute, b->om_execute); a->om_error_status += b->om_error_status; } static void _print_rpc_iostats(struct seq_file *seq, struct rpc_iostats *stats, int op, const struct rpc_procinfo *procs) { _print_name(seq, op, procs); seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %llu %lu\n", stats->om_ops, stats->om_ntrans, stats->om_timeouts, stats->om_bytes_sent, stats->om_bytes_recv, ktime_to_ms(stats->om_queue), ktime_to_ms(stats->om_rtt), ktime_to_ms(stats->om_execute), stats->om_error_status); } static int do_print_stats(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *seqv) { struct seq_file *seq = seqv; xprt->ops->print_stats(xprt, seq); return 0; } void rpc_clnt_show_stats(struct seq_file *seq, struct rpc_clnt *clnt) { unsigned int op, maxproc = clnt->cl_maxproc; if (!clnt->cl_metrics) return; seq_printf(seq, "\tRPC iostats version: %s ", RPC_IOSTATS_VERS); seq_printf(seq, "p/v: %u/%u (%s)\n", clnt->cl_prog, clnt->cl_vers, clnt->cl_program->name); rpc_clnt_iterate_for_each_xprt(clnt, do_print_stats, seq); seq_printf(seq, "\tper-op statistics\n"); for (op = 0; op < maxproc; op++) { struct rpc_iostats stats = {}; struct rpc_clnt *next = clnt; do { _add_rpc_iostats(&stats, &next->cl_metrics[op]); if (next == next->cl_parent) break; next = next->cl_parent; } while (next); _print_rpc_iostats(seq, &stats, op, clnt->cl_procinfo); } } EXPORT_SYMBOL_GPL(rpc_clnt_show_stats); /* * Register/unregister RPC proc files */ static inline struct proc_dir_entry * do_register(struct net *net, const char *name, void *data, const struct proc_ops *proc_ops) { struct sunrpc_net *sn; dprintk("RPC: registering /proc/net/rpc/%s\n", name); sn = net_generic(net, sunrpc_net_id); return proc_create_data(name, 0, sn->proc_net_rpc, proc_ops, data); } struct proc_dir_entry * rpc_proc_register(struct net *net, struct rpc_stat *statp) { return do_register(net, statp->program->name, statp, &rpc_proc_ops); } EXPORT_SYMBOL_GPL(rpc_proc_register); void rpc_proc_unregister(struct net *net, const char *name) { struct sunrpc_net *sn; sn = net_generic(net, sunrpc_net_id); remove_proc_entry(name, sn->proc_net_rpc); } EXPORT_SYMBOL_GPL(rpc_proc_unregister); struct proc_dir_entry * svc_proc_register(struct net *net, struct svc_stat *statp, const struct proc_ops *proc_ops) { return do_register(net, statp->program->pg_name, net, proc_ops); } EXPORT_SYMBOL_GPL(svc_proc_register); void svc_proc_unregister(struct net *net, const char *name) { struct sunrpc_net *sn; sn = net_generic(net, sunrpc_net_id); remove_proc_entry(name, sn->proc_net_rpc); } EXPORT_SYMBOL_GPL(svc_proc_unregister); int rpc_proc_init(struct net *net) { struct sunrpc_net *sn; dprintk("RPC: registering /proc/net/rpc\n"); sn = net_generic(net, sunrpc_net_id); sn->proc_net_rpc = proc_mkdir("rpc", net->proc_net); if (sn->proc_net_rpc == NULL) return -ENOMEM; return 0; } void rpc_proc_exit(struct net *net) { dprintk("RPC: unregistering /proc/net/rpc\n"); remove_proc_entry("rpc", net->proc_net); }
196 196 196 197 95 111 197 197 197 197 197 111 119 1 20 197 197 197 272 197 1 80 81 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs * * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/kernel.h> #include <linux/kstrtox.h> #include <linux/errno.h> #include <linux/wait.h> #include <linux/unistd.h> #include <linux/stddef.h> #include <linux/personality.h> #include <linux/uaccess.h> #include <linux/user-return-notifier.h> #include <linux/uprobes.h> #include <linux/context_tracking.h> #include <linux/entry-common.h> #include <linux/syscalls.h> #include <linux/rseq.h> #include <asm/processor.h> #include <asm/ucontext.h> #include <asm/fpu/signal.h> #include <asm/fpu/xstate.h> #include <asm/vdso.h> #include <asm/mce.h> #include <asm/sighandling.h> #include <asm/vm86.h> #include <asm/syscall.h> #include <asm/sigframe.h> #include <asm/signal.h> #include <asm/shstk.h> static inline int is_ia32_compat_frame(struct ksignal *ksig) { return IS_ENABLED(CONFIG_IA32_EMULATION) && ksig->ka.sa.sa_flags & SA_IA32_ABI; } static inline int is_ia32_frame(struct ksignal *ksig) { return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig); } static inline int is_x32_frame(struct ksignal *ksig) { return IS_ENABLED(CONFIG_X86_X32_ABI) && ksig->ka.sa.sa_flags & SA_X32_ABI; } /* * Enable all pkeys temporarily, so as to ensure that both the current * execution stack as well as the alternate signal stack are writeable. * The application can use any of the available pkeys to protect the * alternate signal stack, and we don't know which one it is, so enable * all. The PKRU register will be reset to init_pkru later in the flow, * in fpu__clear_user_states(), and it is the application's responsibility * to enable the appropriate pkey as the first step in the signal handler * so that the handler does not segfault. */ static inline u32 sig_prepare_pkru(void) { u32 orig_pkru = read_pkru(); write_pkru(0); return orig_pkru; } /* * Set up a signal frame. */ /* x86 ABI requires 16-byte alignment */ #define FRAME_ALIGNMENT 16UL #define MAX_FRAME_PADDING (FRAME_ALIGNMENT - 1) /* * Determine which stack to use.. */ void __user * get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { struct k_sigaction *ka = &ksig->ka; int ia32_frame = is_ia32_frame(ksig); /* Default to using normal stack */ bool nested_altstack = on_sig_stack(regs->sp); bool entering_altstack = false; unsigned long math_size = 0; unsigned long sp = regs->sp; unsigned long buf_fx = 0; u32 pkru; /* redzone */ if (!ia32_frame) sp -= 128; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { /* * This checks nested_altstack via sas_ss_flags(). Sensible * programs use SS_AUTODISARM, which disables that check, and * programs that don't use SS_AUTODISARM get compatible. */ if (sas_ss_flags(sp) == 0) { sp = current->sas_ss_sp + current->sas_ss_size; entering_altstack = true; } } else if (ia32_frame && !nested_altstack && regs->ss != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { /* This is the legacy signal stack switching. */ sp = (unsigned long) ka->sa.sa_restorer; entering_altstack = true; } sp = fpu__alloc_mathframe(sp, ia32_frame, &buf_fx, &math_size); *fpstate = (void __user *)sp; sp -= frame_size; if (ia32_frame) /* * Align the stack pointer according to the i386 ABI, * i.e. so that on function entry ((sp + 4) & 15) == 0. */ sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4; else sp = round_down(sp, FRAME_ALIGNMENT) - 8; /* * If we are on the alternate signal stack and would overflow it, don't. * Return an always-bogus address instead so we will die with SIGSEGV. */ if (unlikely((nested_altstack || entering_altstack) && !__on_sig_stack(sp))) { if (show_unhandled_signals && printk_ratelimit()) pr_info("%s[%d] overflowed sigaltstack\n", current->comm, task_pid_nr(current)); return (void __user *)-1L; } /* Update PKRU to enable access to the alternate signal stack. */ pkru = sig_prepare_pkru(); /* save i387 and extended state */ if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size, pkru)) { /* * Restore PKRU to the original, user-defined value; disable * extra pkeys enabled for the alternate signal stack, if any. */ write_pkru(pkru); return (void __user *)-1L; } return (void __user *)sp; } /* * There are four different struct types for signal frame: sigframe_ia32, * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case * -- the largest size. It means the size for 64-bit apps is a bit more * than needed, but this keeps the code simple. */ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) # define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct sigframe_ia32) #else # define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct rt_sigframe) #endif /* * The FP state frame contains an XSAVE buffer which must be 64-byte aligned. * If a signal frame starts at an unaligned address, extra space is required. * This is the max alignment padding, conservatively. */ #define MAX_XSAVE_PADDING 63UL /* * The frame data is composed of the following areas and laid out as: * * ------------------------- * | alignment padding | * ------------------------- * | (f)xsave frame | * ------------------------- * | fsave header | * ------------------------- * | alignment padding | * ------------------------- * | siginfo + ucontext | * ------------------------- */ /* max_frame_size tells userspace the worst case signal stack size. */ static unsigned long __ro_after_init max_frame_size; static unsigned int __ro_after_init fpu_default_state_size; static int __init init_sigframe_size(void) { fpu_default_state_size = fpu__get_fpstate_size(); max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING; max_frame_size += fpu_default_state_size + MAX_XSAVE_PADDING; /* Userspace expects an aligned size. */ max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT); pr_info("max sigframe size: %lu\n", max_frame_size); return 0; } early_initcall(init_sigframe_size); unsigned long get_sigframe_size(void) { return max_frame_size; } static int setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { /* Perform fixup for the pre-signal frame. */ rseq_signal_deliver(ksig, regs); /* Set up the stack frame */ if (is_ia32_frame(ksig)) { if (ksig->ka.sa.sa_flags & SA_SIGINFO) return ia32_setup_rt_frame(ksig, regs); else return ia32_setup_frame(ksig, regs); } else if (is_x32_frame(ksig)) { return x32_setup_rt_frame(ksig, regs); } else { return x64_setup_rt_frame(ksig, regs); } } static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { bool stepping, failed; struct fpu *fpu = &current->thread.fpu; if (v8086_mode(regs)) save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL); /* Are we from a system call? */ if (syscall_get_nr(current, regs) != -1) { /* If so, check system call restarting.. */ switch (syscall_get_error(current, regs)) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: regs->ax = -EINTR; break; case -ERESTARTSYS: if (!(ksig->ka.sa.sa_flags & SA_RESTART)) { regs->ax = -EINTR; break; } fallthrough; case -ERESTARTNOINTR: regs->ax = regs->orig_ax; regs->ip -= 2; break; } } /* * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now * so that register information in the sigcontext is correct and * then notify the tracer before entering the signal handler. */ stepping = test_thread_flag(TIF_SINGLESTEP); if (stepping) user_disable_single_step(current); failed = (setup_rt_frame(ksig, regs) < 0); if (!failed) { /* * Clear the direction flag as per the ABI for function entry. * * Clear RF when entering the signal handler, because * it might disable possible debug exception from the * signal handler. * * Clear TF for the case when it wasn't set by debugger to * avoid the recursive send_sigtrap() in SIGTRAP handler. */ regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF); /* * Ensure the signal handler starts with the new fpu state. */ fpu__clear_user_states(fpu); } signal_setup_done(failed, ksig, stepping); } static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { #ifdef CONFIG_IA32_EMULATION if (current->restart_block.arch_data & TS_COMPAT) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI return __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); #else return __NR_restart_syscall; #endif } /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ void arch_do_signal_or_restart(struct pt_regs *regs) { struct ksignal ksig; if (get_signal(&ksig)) { /* Whee! Actually deliver the signal. */ handle_signal(&ksig, regs); return; } /* Did we come from a system call? */ if (syscall_get_nr(current, regs) != -1) { /* Restart the system call - no handlers present */ switch (syscall_get_error(current, regs)) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: regs->ax = regs->orig_ax; regs->ip -= 2; break; case -ERESTART_RESTARTBLOCK: regs->ax = get_nr_restart_syscall(regs); regs->ip -= 2; break; } } /* * If there's no signal to deliver, we just put the saved sigmask * back. */ restore_saved_sigmask(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; if (show_unhandled_signals && printk_ratelimit()) { printk("%s" "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, me->comm, me->pid, where, frame, regs->ip, regs->sp, regs->orig_ax); print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } force_sig(SIGSEGV); } #ifdef CONFIG_DYNAMIC_SIGFRAME #ifdef CONFIG_STRICT_SIGALTSTACK_SIZE static bool strict_sigaltstack_size __ro_after_init = true; #else static bool strict_sigaltstack_size __ro_after_init = false; #endif static int __init strict_sas_size(char *arg) { return kstrtobool(arg, &strict_sigaltstack_size) == 0; } __setup("strict_sas_size", strict_sas_size); /* * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512 * exceeds that size already. As such programs might never use the * sigaltstack they just continued to work. While always checking against * the real size would be correct, this might be considered a regression. * * Therefore avoid the sanity check, unless enforced by kernel * configuration or command line option. * * When dynamic FPU features are supported, the check is also enforced when * the task has permissions to use dynamic features. Tasks which have no * permission are checked against the size of the non-dynamic feature set * if strict checking is enabled. This avoids forcing all tasks on the * system to allocate large sigaltstacks even if they are never going * to use a dynamic feature. As this is serialized via sighand::siglock * any permission request for a dynamic feature either happened already * or will see the newly install sigaltstack size in the permission checks. */ bool sigaltstack_size_valid(size_t ss_size) { unsigned long fsize = max_frame_size - fpu_default_state_size; u64 mask; lockdep_assert_held(&current->sighand->siglock); if (!fpu_state_size_dynamic() && !strict_sigaltstack_size) return true; fsize += current->group_leader->thread.fpu.perm.__user_state_size; if (likely(ss_size > fsize)) return true; if (strict_sigaltstack_size) return ss_size > fsize; mask = current->group_leader->thread.fpu.perm.__state_perm; if (mask & XFEATURE_MASK_USER_DYNAMIC) return ss_size > fsize; return true; } #endif /* CONFIG_DYNAMIC_SIGFRAME */
30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 // SPDX-License-Identifier: GPL-2.0-only /* * Unified UUID/GUID definition * * Copyright (C) 2009, 2016 Intel Corp. * Huang Ying <ying.huang@intel.com> */ #include <linux/kernel.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/uuid.h> #include <linux/random.h> const guid_t guid_null; EXPORT_SYMBOL(guid_null); const uuid_t uuid_null; EXPORT_SYMBOL(uuid_null); const u8 guid_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; const u8 uuid_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; /** * generate_random_uuid - generate a random UUID * @uuid: where to put the generated UUID * * Random UUID interface * * Used to create a Boot ID or a filesystem UUID/GUID, but can be * useful for other kernel drivers. */ void generate_random_uuid(unsigned char uuid[16]) { get_random_bytes(uuid, 16); /* Set UUID version to 4 --- truly random generation */ uuid[6] = (uuid[6] & 0x0F) | 0x40; /* Set the UUID variant to DCE */ uuid[8] = (uuid[8] & 0x3F) | 0x80; } EXPORT_SYMBOL(generate_random_uuid); void generate_random_guid(unsigned char guid[16]) { get_random_bytes(guid, 16); /* Set GUID version to 4 --- truly random generation */ guid[7] = (guid[7] & 0x0F) | 0x40; /* Set the GUID variant to DCE */ guid[8] = (guid[8] & 0x3F) | 0x80; } EXPORT_SYMBOL(generate_random_guid); static void __uuid_gen_common(__u8 b[16]) { get_random_bytes(b, 16); /* reversion 0b10 */ b[8] = (b[8] & 0x3F) | 0x80; } void guid_gen(guid_t *lu) { __uuid_gen_common(lu->b); /* version 4 : random generation */ lu->b[7] = (lu->b[7] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(guid_gen); void uuid_gen(uuid_t *bu) { __uuid_gen_common(bu->b); /* version 4 : random generation */ bu->b[6] = (bu->b[6] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(uuid_gen); /** * uuid_is_valid - checks if a UUID string is valid * @uuid: UUID string to check * * Description: * It checks if the UUID string is following the format: * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx * * where x is a hex digit. * * Return: true if input is valid UUID string. */ bool uuid_is_valid(const char *uuid) { unsigned int i; for (i = 0; i < UUID_STRING_LEN; i++) { if (i == 8 || i == 13 || i == 18 || i == 23) { if (uuid[i] != '-') return false; } else if (!isxdigit(uuid[i])) { return false; } } return true; } EXPORT_SYMBOL(uuid_is_valid); static int __uuid_parse(const char *uuid, __u8 b[16], const u8 ei[16]) { static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34}; unsigned int i; if (!uuid_is_valid(uuid)) return -EINVAL; for (i = 0; i < 16; i++) { int hi = hex_to_bin(uuid[si[i] + 0]); int lo = hex_to_bin(uuid[si[i] + 1]); b[ei[i]] = (hi << 4) | lo; } return 0; } int guid_parse(const char *uuid, guid_t *u) { return __uuid_parse(uuid, u->b, guid_index); } EXPORT_SYMBOL(guid_parse); int uuid_parse(const char *uuid, uuid_t *u) { return __uuid_parse(uuid, u->b, uuid_index); } EXPORT_SYMBOL(uuid_parse);
54 54 115 115 543 543 338 12 19 32 542 543 338 10 34 63 63 63 63 63 428 428 428 428 424 4 144 145 84 144 145 115 114 115 115 86 30 3 115 115 112 2 57 57 57 54 26 22 3 1 22 54 20 8 4 20 17 5 12 127 127 12 115 127 12 12 12 6 1 1 4 4 4 4 2 3 2 23 23 2 11 35 1 1 1 1 23 6 3 2 25 24 2 24 25 25 24 2 1 24 23 2 24 2 25 21 3 33 33 1 25 25 25 13 11 1 2 25 7 15 5 3 4 60 58 2 55 4 41 16 45 2 47 3 42 5 5 5 9 47 2 40 10 14 4 4 19 2 1 6 2 11 8 8 1 17 17 2 1 1 1 1 7 7 11 11 4 11 2 5 4 3 16 16 8 16 229 228 227 2 230 230 16 192 5 32 222 16 10 16 7 16 229 10 37 17 1 1 1010 1007 1005 534 423 45 5 614 166 3 213 239 45 45 88 145 37 36 167 170 3 170 168 3 10 170 170 166 22 169 168 4 6 167 5 169 8 10 10 8 10 10 10 10 10 10 6 10 10 10 161 160 161 977 977 1102 1105 169 30 8 8 8 3 3 3 573 572 573 572 573 586 181 478 478 475 477 477 478 9 572 573 572 8 8 2 2 3 1 3 3 3 3 11 11 11 11 11 4 11 8 8 11 464 464 180 463 462 180 46 46 46 46 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 // SPDX-License-Identifier: GPL-2.0-or-later /* * NET3 IP device support routines. * * Derived from the IP parts of dev.c 1.0.19 * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * * Additional Authors: * Alan Cox, <gw4pts@gw4pts.ampr.org> * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Changes: * Alexey Kuznetsov: pa_* fields are replaced with ifaddr * lists. * Cyrus Durgin: updated for kmod * Matthias Andree: in devinet_ioctl, compare label and * address (4.4BSD alias style support), * fall back to comparing just the label * if no match found. */ #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/if_addr.h> #include <linux/if_ether.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/notifier.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/slab.h> #include <linux/hash.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/kmod.h> #include <linux/netconf.h> #include <net/arp.h> #include <net/ip.h> #include <net/route.h> #include <net/ip_fib.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/addrconf.h> #define IPV6ONLY_FLAGS \ (IFA_F_NODAD | IFA_F_OPTIMISTIC | IFA_F_DADFAILED | \ IFA_F_HOMEADDRESS | IFA_F_TENTATIVE | \ IFA_F_MANAGETEMPADDR | IFA_F_STABLE_PRIVACY) static struct ipv4_devconf ipv4_devconf = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1, }, }; static struct ipv4_devconf ipv4_devconf_dflt = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1, }, }; #define IPV4_DEVCONF_DFLT(net, attr) \ IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr) static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U32 }, [IFA_ADDRESS] = { .type = NLA_U32 }, [IFA_BROADCAST] = { .type = NLA_U32 }, [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, [IFA_FLAGS] = { .type = NLA_U32 }, [IFA_RT_PRIORITY] = { .type = NLA_U32 }, [IFA_TARGET_NETNSID] = { .type = NLA_S32 }, [IFA_PROTO] = { .type = NLA_U8 }, }; struct inet_fill_args { u32 portid; u32 seq; int event; unsigned int flags; int netnsid; int ifindex; }; #define IN4_ADDR_HSIZE_SHIFT 8 #define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) static u32 inet_addr_hash(const struct net *net, __be32 addr) { u32 val = __ipv4_addr_hash(addr, net_hash_mix(net)); return hash_32(val, IN4_ADDR_HSIZE_SHIFT); } static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) { u32 hash = inet_addr_hash(net, ifa->ifa_local); ASSERT_RTNL(); hlist_add_head_rcu(&ifa->addr_lst, &net->ipv4.inet_addr_lst[hash]); } static void inet_hash_remove(struct in_ifaddr *ifa) { ASSERT_RTNL(); hlist_del_init_rcu(&ifa->addr_lst); } /** * __ip_dev_find - find the first device with a given source address. * @net: the net namespace * @addr: the source address * @devref: if true, take a reference on the found device * * If a caller uses devref=false, it should be protected by RCU, or RTNL */ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { struct net_device *result = NULL; struct in_ifaddr *ifa; rcu_read_lock(); ifa = inet_lookup_ifaddr_rcu(net, addr); if (!ifa) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res = { 0 }; struct fib_table *local; /* Fallback to FIB local table so that communication * over loopback subnets work. */ local = fib_get_table(net, RT_TABLE_LOCAL); if (local && !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && res.type == RTN_LOCAL) result = FIB_RES_DEV(res); } else { result = ifa->ifa_dev->dev; } if (result && devref) dev_hold(result); rcu_read_unlock(); return result; } EXPORT_SYMBOL(__ip_dev_find); /* called under RCU lock */ struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr) { u32 hash = inet_addr_hash(net, addr); struct in_ifaddr *ifa; hlist_for_each_entry_rcu(ifa, &net->ipv4.inet_addr_lst[hash], addr_lst) if (ifa->ifa_local == addr) return ifa; return NULL; } static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain); static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy); #ifdef CONFIG_SYSCTL static int devinet_sysctl_register(struct in_device *idev); static void devinet_sysctl_unregister(struct in_device *idev); #else static int devinet_sysctl_register(struct in_device *idev) { return 0; } static void devinet_sysctl_unregister(struct in_device *idev) { } #endif /* Locks all the inet devices. */ static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev) { struct in_ifaddr *ifa; ifa = kzalloc(sizeof(*ifa), GFP_KERNEL_ACCOUNT); if (!ifa) return NULL; in_dev_hold(in_dev); ifa->ifa_dev = in_dev; INIT_HLIST_NODE(&ifa->addr_lst); return ifa; } static void inet_rcu_free_ifa(struct rcu_head *head) { struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head); in_dev_put(ifa->ifa_dev); kfree(ifa); } static void inet_free_ifa(struct in_ifaddr *ifa) { /* Our reference to ifa->ifa_dev must be freed ASAP * to release the reference to the netdev the same way. * in_dev_put() -> in_dev_finish_destroy() -> netdev_put() */ call_rcu_hurry(&ifa->rcu_head, inet_rcu_free_ifa); } static void in_dev_free_rcu(struct rcu_head *head) { struct in_device *idev = container_of(head, struct in_device, rcu_head); kfree(rcu_dereference_protected(idev->mc_hash, 1)); kfree(idev); } void in_dev_finish_destroy(struct in_device *idev) { struct net_device *dev = idev->dev; WARN_ON(idev->ifa_list); WARN_ON(idev->mc_list); #ifdef NET_REFCNT_DEBUG pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL"); #endif netdev_put(dev, &idev->dev_tracker); if (!idev->dead) pr_err("Freeing alive in_device %p\n", idev); else call_rcu(&idev->rcu_head, in_dev_free_rcu); } EXPORT_SYMBOL(in_dev_finish_destroy); static struct in_device *inetdev_init(struct net_device *dev) { struct in_device *in_dev; int err = -ENOMEM; ASSERT_RTNL(); in_dev = kzalloc(sizeof(*in_dev), GFP_KERNEL); if (!in_dev) goto out; memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt, sizeof(in_dev->cnf)); in_dev->cnf.sysctl = NULL; in_dev->dev = dev; in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl); if (!in_dev->arp_parms) goto out_kfree; if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) dev_disable_lro(dev); /* Reference in_dev->dev */ netdev_hold(dev, &in_dev->dev_tracker, GFP_KERNEL); /* Account for reference dev->ip_ptr (below) */ refcount_set(&in_dev->refcnt, 1); if (dev != blackhole_netdev) { err = devinet_sysctl_register(in_dev); if (err) { in_dev->dead = 1; neigh_parms_release(&arp_tbl, in_dev->arp_parms); in_dev_put(in_dev); in_dev = NULL; goto out; } ip_mc_init_dev(in_dev); if (dev->flags & IFF_UP) ip_mc_up(in_dev); } /* we can receive as soon as ip_ptr is set -- do this last */ rcu_assign_pointer(dev->ip_ptr, in_dev); out: return in_dev ?: ERR_PTR(err); out_kfree: kfree(in_dev); in_dev = NULL; goto out; } static void inetdev_destroy(struct in_device *in_dev) { struct net_device *dev; struct in_ifaddr *ifa; ASSERT_RTNL(); dev = in_dev->dev; in_dev->dead = 1; ip_mc_destroy_dev(in_dev); while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) { inet_del_ifa(in_dev, &in_dev->ifa_list, 0); inet_free_ifa(ifa); } RCU_INIT_POINTER(dev->ip_ptr, NULL); devinet_sysctl_unregister(in_dev); neigh_parms_release(&arp_tbl, in_dev->arp_parms); arp_ifdown(dev); in_dev_put(in_dev); } static int __init inet_blackhole_dev_init(void) { int err = 0; rtnl_lock(); if (!inetdev_init(blackhole_netdev)) err = -ENOMEM; rtnl_unlock(); return err; } late_initcall(inet_blackhole_dev_init); int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b) { const struct in_ifaddr *ifa; rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(a, ifa)) { if (!b || inet_ifa_match(b, ifa)) { rcu_read_unlock(); return 1; } } } rcu_read_unlock(); return 0; } static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy, struct nlmsghdr *nlh, u32 portid) { struct in_ifaddr *promote = NULL; struct in_ifaddr *ifa, *ifa1; struct in_ifaddr __rcu **last_prim; struct in_ifaddr *prev_prom = NULL; int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); ifa1 = rtnl_dereference(*ifap); last_prim = ifap; if (in_dev->dead) goto no_promotions; /* 1. Deleting primary ifaddr forces deletion all secondaries * unless alias promotion is set **/ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next; while ((ifa = rtnl_dereference(*ifap1)) != NULL) { if (!(ifa->ifa_flags & IFA_F_SECONDARY) && ifa1->ifa_scope <= ifa->ifa_scope) last_prim = &ifa->ifa_next; if (!(ifa->ifa_flags & IFA_F_SECONDARY) || ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) { ifap1 = &ifa->ifa_next; prev_prom = ifa; continue; } if (!do_promote) { inet_hash_remove(ifa); *ifap1 = ifa->ifa_next; rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); inet_free_ifa(ifa); } else { promote = ifa; break; } } } /* On promotion all secondaries from subnet are changing * the primary IP, we must remove all their routes silently * and later to add them back with new prefsrc. Do this * while all addresses are on the device list. */ for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) fib_del_ifaddr(ifa, ifa1); } no_promotions: /* 2. Unlink it */ *ifap = ifa1->ifa_next; inet_hash_remove(ifa1); /* 3. Announce address deletion */ /* Send message first, then call notifier. At first sight, FIB update triggered by notifier will refer to already deleted ifaddr, that could confuse netlink listeners. It is not true: look, gated sees that route deleted and if it still thinks that ifaddr is valid, it will try to restore deleted routes... Grr. So that, this order is correct. */ rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); if (promote) { struct in_ifaddr *next_sec; next_sec = rtnl_dereference(promote->ifa_next); if (prev_prom) { struct in_ifaddr *last_sec; rcu_assign_pointer(prev_prom->ifa_next, next_sec); last_sec = rtnl_dereference(*last_prim); rcu_assign_pointer(promote->ifa_next, last_sec); rcu_assign_pointer(*last_prim, promote); } promote->ifa_flags &= ~IFA_F_SECONDARY; rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); for (ifa = next_sec; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) continue; fib_add_ifaddr(ifa); } } if (destroy) inet_free_ifa(ifa1); } static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy) { __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); } static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid, struct netlink_ext_ack *extack) { struct in_ifaddr __rcu **last_primary, **ifap; struct in_device *in_dev = ifa->ifa_dev; struct net *net = dev_net(in_dev->dev); struct in_validator_info ivi; struct in_ifaddr *ifa1; int ret; ASSERT_RTNL(); ifa->ifa_flags &= ~IFA_F_SECONDARY; last_primary = &in_dev->ifa_list; /* Don't set IPv6 only flags to IPv4 addresses */ ifa->ifa_flags &= ~IPV6ONLY_FLAGS; ifap = &in_dev->ifa_list; ifa1 = rtnl_dereference(*ifap); while (ifa1) { if (!(ifa1->ifa_flags & IFA_F_SECONDARY) && ifa->ifa_scope <= ifa1->ifa_scope) last_primary = &ifa1->ifa_next; if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) { if (ifa1->ifa_local == ifa->ifa_local) { inet_free_ifa(ifa); return -EEXIST; } if (ifa1->ifa_scope != ifa->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid scope value"); inet_free_ifa(ifa); return -EINVAL; } ifa->ifa_flags |= IFA_F_SECONDARY; } ifap = &ifa1->ifa_next; ifa1 = rtnl_dereference(*ifap); } /* Allow any devices that wish to register ifaddr validtors to weigh * in now, before changes are committed. The rntl lock is serializing * access here, so the state should not change between a validator call * and a final notify on commit. This isn't invoked on promotion under * the assumption that validators are checking the address itself, and * not the flags. */ ivi.ivi_addr = ifa->ifa_address; ivi.ivi_dev = ifa->ifa_dev; ivi.extack = extack; ret = blocking_notifier_call_chain(&inetaddr_validator_chain, NETDEV_UP, &ivi); ret = notifier_to_errno(ret); if (ret) { inet_free_ifa(ifa); return ret; } if (!(ifa->ifa_flags & IFA_F_SECONDARY)) ifap = last_primary; rcu_assign_pointer(ifa->ifa_next, *ifap); rcu_assign_pointer(*ifap, ifa); inet_hash_insert(dev_net(in_dev->dev), ifa); cancel_delayed_work(&net->ipv4.addr_chk_work); queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0); /* Send message first, then call notifier. Notifier will trigger FIB update, so that listeners of netlink will know about new ifaddr */ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); return 0; } static int inet_insert_ifa(struct in_ifaddr *ifa) { if (!ifa->ifa_local) { inet_free_ifa(ifa); return 0; } return __inet_insert_ifa(ifa, NULL, 0, NULL); } static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) { struct in_device *in_dev = __in_dev_get_rtnl_net(dev); ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); if (ipv4_is_loopback(ifa->ifa_local)) ifa->ifa_scope = RT_SCOPE_HOST; return inet_insert_ifa(ifa); } /* Caller must hold RCU or RTNL : * We dont take a reference on found in_device */ struct in_device *inetdev_by_index(struct net *net, int ifindex) { struct net_device *dev; struct in_device *in_dev = NULL; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) in_dev = rcu_dereference_rtnl(dev->ip_ptr); rcu_read_unlock(); return in_dev; } EXPORT_SYMBOL(inetdev_by_index); /* Called only from RTNL semaphored context. No locks. */ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask) { struct in_ifaddr *ifa; ASSERT_RTNL(); in_dev_for_each_ifa_rtnl(ifa, in_dev) { if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa)) return ifa; } return NULL; } static int ip_mc_autojoin_config(struct net *net, bool join, const struct in_ifaddr *ifa) { #if defined(CONFIG_IP_MULTICAST) struct ip_mreqn mreq = { .imr_multiaddr.s_addr = ifa->ifa_address, .imr_ifindex = ifa->ifa_dev->dev->ifindex, }; struct sock *sk = net->ipv4.mc_autojoin_sk; int ret; ASSERT_RTNL_NET(net); lock_sock(sk); if (join) ret = ip_mc_join_group(sk, &mreq); else ret = ip_mc_leave_group(sk, &mreq); release_sock(sk); return ret; #else return -EOPNOTSUPP; #endif } static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct in_ifaddr __rcu **ifap; struct nlattr *tb[IFA_MAX+1]; struct in_device *in_dev; struct ifaddrmsg *ifm; struct in_ifaddr *ifa; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) goto out; ifm = nlmsg_data(nlh); rtnl_net_lock(net); in_dev = inetdev_by_index(net, ifm->ifa_index); if (!in_dev) { NL_SET_ERR_MSG(extack, "ipv4: Device not found"); err = -ENODEV; goto unlock; } for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL])) continue; if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label)) continue; if (tb[IFA_ADDRESS] && (ifm->ifa_prefixlen != ifa->ifa_prefixlen || !inet_ifa_match(nla_get_in_addr(tb[IFA_ADDRESS]), ifa))) continue; if (ipv4_is_multicast(ifa->ifa_address)) ip_mc_autojoin_config(net, false, ifa); __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); goto unlock; } NL_SET_ERR_MSG(extack, "ipv4: Address not found"); err = -EADDRNOTAVAIL; unlock: rtnl_net_unlock(net); out: return err; } static void check_lifetime(struct work_struct *work) { unsigned long now, next, next_sec, next_sched; struct in_ifaddr *ifa; struct hlist_node *n; struct net *net; int i; net = container_of(to_delayed_work(work), struct net, ipv4.addr_chk_work); now = jiffies; next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); for (i = 0; i < IN4_ADDR_HSIZE; i++) { struct hlist_head *head = &net->ipv4.inet_addr_lst[i]; bool change_needed = false; rcu_read_lock(); hlist_for_each_entry_rcu(ifa, head, addr_lst) { unsigned long age, tstamp; u32 preferred_lft; u32 valid_lft; u32 flags; flags = READ_ONCE(ifa->ifa_flags); if (flags & IFA_F_PERMANENT) continue; preferred_lft = READ_ONCE(ifa->ifa_preferred_lft); valid_lft = READ_ONCE(ifa->ifa_valid_lft); tstamp = READ_ONCE(ifa->ifa_tstamp); /* We try to batch several events at once. */ age = (now - tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (valid_lft != INFINITY_LIFE_TIME && age >= valid_lft) { change_needed = true; } else if (preferred_lft == INFINITY_LIFE_TIME) { continue; } else if (age >= preferred_lft) { if (time_before(tstamp + valid_lft * HZ, next)) next = tstamp + valid_lft * HZ; if (!(flags & IFA_F_DEPRECATED)) change_needed = true; } else if (time_before(tstamp + preferred_lft * HZ, next)) { next = tstamp + preferred_lft * HZ; } } rcu_read_unlock(); if (!change_needed) continue; rtnl_net_lock(net); hlist_for_each_entry_safe(ifa, n, head, addr_lst) { unsigned long age; if (ifa->ifa_flags & IFA_F_PERMANENT) continue; /* We try to batch several events at once. */ age = (now - ifa->ifa_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_valid_lft) { struct in_ifaddr __rcu **ifap; struct in_ifaddr *tmp; ifap = &ifa->ifa_dev->ifa_list; tmp = rtnl_net_dereference(net, *ifap); while (tmp) { if (tmp == ifa) { inet_del_ifa(ifa->ifa_dev, ifap, 1); break; } ifap = &tmp->ifa_next; tmp = rtnl_net_dereference(net, *ifap); } } else if (ifa->ifa_preferred_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_preferred_lft && !(ifa->ifa_flags & IFA_F_DEPRECATED)) { ifa->ifa_flags |= IFA_F_DEPRECATED; rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } rtnl_net_unlock(net); } next_sec = round_jiffies_up(next); next_sched = next; /* If rounded timeout is accurate enough, accept it. */ if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ)) next_sched = next_sec; now = jiffies; /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */ if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX)) next_sched = now + ADDRCONF_TIMER_FUZZ_MAX; queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, next_sched - now); } static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, __u32 prefered_lft) { unsigned long timeout; u32 flags; flags = ifa->ifa_flags & ~(IFA_F_PERMANENT | IFA_F_DEPRECATED); timeout = addrconf_timeout_fixup(valid_lft, HZ); if (addrconf_finite_timeout(timeout)) WRITE_ONCE(ifa->ifa_valid_lft, timeout); else flags |= IFA_F_PERMANENT; timeout = addrconf_timeout_fixup(prefered_lft, HZ); if (addrconf_finite_timeout(timeout)) { if (timeout == 0) flags |= IFA_F_DEPRECATED; WRITE_ONCE(ifa->ifa_preferred_lft, timeout); } WRITE_ONCE(ifa->ifa_flags, flags); WRITE_ONCE(ifa->ifa_tstamp, jiffies); if (!ifa->ifa_cstamp) WRITE_ONCE(ifa->ifa_cstamp, ifa->ifa_tstamp); } static int inet_validate_rtm(struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack, __u32 *valid_lft, __u32 *prefered_lft) { struct ifaddrmsg *ifm = nlmsg_data(nlh); int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) return err; if (ifm->ifa_prefixlen > 32) { NL_SET_ERR_MSG(extack, "ipv4: Invalid prefix length"); return -EINVAL; } if (!tb[IFA_LOCAL]) { NL_SET_ERR_MSG(extack, "ipv4: Local address is not supplied"); return -EINVAL; } if (tb[IFA_CACHEINFO]) { struct ifa_cacheinfo *ci; ci = nla_data(tb[IFA_CACHEINFO]); if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) { NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid"); return -EINVAL; } *valid_lft = ci->ifa_valid; *prefered_lft = ci->ifa_prefered; } return 0; } static struct in_ifaddr *inet_rtm_to_ifa(struct net *net, struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ifaddrmsg *ifm = nlmsg_data(nlh); struct in_device *in_dev; struct net_device *dev; struct in_ifaddr *ifa; int err; dev = __dev_get_by_index(net, ifm->ifa_index); err = -ENODEV; if (!dev) { NL_SET_ERR_MSG(extack, "ipv4: Device not found"); goto errout; } in_dev = __in_dev_get_rtnl_net(dev); err = -ENOBUFS; if (!in_dev) goto errout; ifa = inet_alloc_ifa(in_dev); if (!ifa) /* * A potential indev allocation can be left alive, it stays * assigned to its device and is destroy with it. */ goto errout; ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); if (!tb[IFA_ADDRESS]) tb[IFA_ADDRESS] = tb[IFA_LOCAL]; ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); ifa->ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); ifa->ifa_scope = ifm->ifa_scope; ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]); ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]); if (tb[IFA_BROADCAST]) ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]); if (tb[IFA_LABEL]) nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); if (tb[IFA_RT_PRIORITY]) ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]); if (tb[IFA_PROTO]) ifa->ifa_proto = nla_get_u8(tb[IFA_PROTO]); return ifa; errout: return ERR_PTR(err); } static struct in_ifaddr *find_matching_ifa(struct net *net, struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1; in_dev_for_each_ifa_rtnl_net(net, ifa1, in_dev) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa) && ifa1->ifa_local == ifa->ifa_local) return ifa1; } return NULL; } static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { __u32 prefered_lft = INFINITY_LIFE_TIME; __u32 valid_lft = INFINITY_LIFE_TIME; struct net *net = sock_net(skb->sk); struct in_ifaddr *ifa_existing; struct nlattr *tb[IFA_MAX + 1]; struct in_ifaddr *ifa; int ret; ret = inet_validate_rtm(nlh, tb, extack, &valid_lft, &prefered_lft); if (ret < 0) return ret; if (!nla_get_in_addr(tb[IFA_LOCAL])) return 0; rtnl_net_lock(net); ifa = inet_rtm_to_ifa(net, nlh, tb, extack); if (IS_ERR(ifa)) { ret = PTR_ERR(ifa); goto unlock; } ifa_existing = find_matching_ifa(net, ifa); if (!ifa_existing) { /* It would be best to check for !NLM_F_CREATE here but * userspace already relies on not having to provide this. */ set_ifa_lifetime(ifa, valid_lft, prefered_lft); if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) { ret = ip_mc_autojoin_config(net, true, ifa); if (ret < 0) { NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed"); inet_free_ifa(ifa); goto unlock; } } ret = __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, extack); } else { u32 new_metric = ifa->ifa_rt_priority; u8 new_proto = ifa->ifa_proto; inet_free_ifa(ifa); if (nlh->nlmsg_flags & NLM_F_EXCL || !(nlh->nlmsg_flags & NLM_F_REPLACE)) { NL_SET_ERR_MSG(extack, "ipv4: Address already assigned"); ret = -EEXIST; goto unlock; } ifa = ifa_existing; if (ifa->ifa_rt_priority != new_metric) { fib_modify_prefix_metric(ifa, new_metric); ifa->ifa_rt_priority = new_metric; } ifa->ifa_proto = new_proto; set_ifa_lifetime(ifa, valid_lft, prefered_lft); cancel_delayed_work(&net->ipv4.addr_chk_work); queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0); rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); } unlock: rtnl_net_unlock(net); return ret; } /* * Determine a default network mask, based on the IP address. */ static int inet_abc_len(__be32 addr) { int rc = -1; /* Something else, probably a multicast. */ if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) rc = 0; else { __u32 haddr = ntohl(addr); if (IN_CLASSA(haddr)) rc = 8; else if (IN_CLASSB(haddr)) rc = 16; else if (IN_CLASSC(haddr)) rc = 24; else if (IN_CLASSE(haddr)) rc = 32; } return rc; } int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) { struct sockaddr_in sin_orig; struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr; struct in_ifaddr __rcu **ifap = NULL; struct in_device *in_dev; struct in_ifaddr *ifa = NULL; struct net_device *dev; char *colon; int ret = -EFAULT; int tryaddrmatch = 0; ifr->ifr_name[IFNAMSIZ - 1] = 0; /* save original address for comparison */ memcpy(&sin_orig, sin, sizeof(*sin)); colon = strchr(ifr->ifr_name, ':'); if (colon) *colon = 0; dev_load(net, ifr->ifr_name); switch (cmd) { case SIOCGIFADDR: /* Get interface address */ case SIOCGIFBRDADDR: /* Get the broadcast address */ case SIOCGIFDSTADDR: /* Get the destination address */ case SIOCGIFNETMASK: /* Get the netmask for the interface */ /* Note that these ioctls will not sleep, so that we do not impose a lock. One day we will be forced to put shlock here (I mean SMP) */ tryaddrmatch = (sin_orig.sin_family == AF_INET); memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; break; case SIOCSIFFLAGS: ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; break; case SIOCSIFADDR: /* Set interface address (and family) */ case SIOCSIFBRDADDR: /* Set the broadcast address */ case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; ret = -EINVAL; if (sin->sin_family != AF_INET) goto out; break; default: ret = -EINVAL; goto out; } rtnl_net_lock(net); ret = -ENODEV; dev = __dev_get_by_name(net, ifr->ifr_name); if (!dev) goto done; if (colon) *colon = ':'; in_dev = __in_dev_get_rtnl_net(dev); if (in_dev) { if (tryaddrmatch) { /* Matthias Andree */ /* compare label and address (4.4BSD style) */ /* note: we only do this for a limited set of ioctls and only if the original address family was AF_INET. This is checked above. */ for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) { if (!strcmp(ifr->ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == ifa->ifa_local) { break; /* found */ } } } /* we didn't get a match, maybe the application is 4.3BSD-style and passed in junk so we fall back to comparing just the label */ if (!ifa) { for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) if (!strcmp(ifr->ifr_name, ifa->ifa_label)) break; } } ret = -EADDRNOTAVAIL; if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) goto done; switch (cmd) { case SIOCGIFADDR: /* Get interface address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_local; break; case SIOCGIFBRDADDR: /* Get the broadcast address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_broadcast; break; case SIOCGIFDSTADDR: /* Get the destination address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_address; break; case SIOCGIFNETMASK: /* Get the netmask for the interface */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_mask; break; case SIOCSIFFLAGS: if (colon) { ret = -EADDRNOTAVAIL; if (!ifa) break; ret = 0; if (!(ifr->ifr_flags & IFF_UP)) inet_del_ifa(in_dev, ifap, 1); break; } /* NETDEV_UP/DOWN/CHANGE could touch a peer dev */ ASSERT_RTNL(); ret = dev_change_flags(dev, ifr->ifr_flags, NULL); break; case SIOCSIFADDR: /* Set interface address (and family) */ ret = -EINVAL; if (inet_abc_len(sin->sin_addr.s_addr) < 0) break; if (!ifa) { ret = -ENOBUFS; if (!in_dev) break; ifa = inet_alloc_ifa(in_dev); if (!ifa) break; if (colon) memcpy(ifa->ifa_label, ifr->ifr_name, IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); } else { ret = 0; if (ifa->ifa_local == sin->sin_addr.s_addr) break; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = 0; ifa->ifa_scope = 0; } ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr; if (!(dev->flags & IFF_POINTOPOINT)) { ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address); ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); if ((dev->flags & IFF_BROADCAST) && ifa->ifa_prefixlen < 31) ifa->ifa_broadcast = ifa->ifa_address | ~ifa->ifa_mask; } else { ifa->ifa_prefixlen = 32; ifa->ifa_mask = inet_make_mask(32); } set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); ret = inet_set_ifa(dev, ifa); break; case SIOCSIFBRDADDR: /* Set the broadcast address */ ret = 0; if (ifa->ifa_broadcast != sin->sin_addr.s_addr) { inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = sin->sin_addr.s_addr; inet_insert_ifa(ifa); } break; case SIOCSIFDSTADDR: /* Set the destination address */ ret = 0; if (ifa->ifa_address == sin->sin_addr.s_addr) break; ret = -EINVAL; if (inet_abc_len(sin->sin_addr.s_addr) < 0) break; ret = 0; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_address = sin->sin_addr.s_addr; inet_insert_ifa(ifa); break; case SIOCSIFNETMASK: /* Set the netmask for the interface */ /* * The mask we set must be legal. */ ret = -EINVAL; if (bad_mask(sin->sin_addr.s_addr, 0)) break; ret = 0; if (ifa->ifa_mask != sin->sin_addr.s_addr) { __be32 old_mask = ifa->ifa_mask; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_mask = sin->sin_addr.s_addr; ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask); /* See if current broadcast address matches * with current netmask, then recalculate * the broadcast address. Otherwise it's a * funny address, so don't touch it since * the user seems to know what (s)he's doing... */ if ((dev->flags & IFF_BROADCAST) && (ifa->ifa_prefixlen < 31) && (ifa->ifa_broadcast == (ifa->ifa_local|~old_mask))) { ifa->ifa_broadcast = (ifa->ifa_local | ~sin->sin_addr.s_addr); } inet_insert_ifa(ifa); } break; } done: rtnl_net_unlock(net); out: return ret; } int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { struct in_device *in_dev = __in_dev_get_rtnl_net(dev); const struct in_ifaddr *ifa; struct ifreq ifr; int done = 0; if (WARN_ON(size > sizeof(struct ifreq))) goto out; if (!in_dev) goto out; in_dev_for_each_ifa_rtnl_net(dev_net(dev), ifa, in_dev) { if (!buf) { done += size; continue; } if (len < size) break; memset(&ifr, 0, sizeof(struct ifreq)); strcpy(ifr.ifr_name, ifa->ifa_label); (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET; (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = ifa->ifa_local; if (copy_to_user(buf + done, &ifr, size)) { done = -EFAULT; break; } len -= size; done += size; } out: return done; } static __be32 in_dev_select_addr(const struct in_device *in_dev, int scope) { const struct in_ifaddr *ifa; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) continue; if (ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) return ifa->ifa_local; } return 0; } __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) { const struct in_ifaddr *ifa; __be32 addr = 0; unsigned char localnet_scope = RT_SCOPE_HOST; struct in_device *in_dev; struct net *net = dev_net(dev); int master_idx; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto no_in_dev; if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) localnet_scope = RT_SCOPE_LINK; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) continue; if (min(ifa->ifa_scope, localnet_scope) > scope) continue; if (!dst || inet_ifa_match(dst, ifa)) { addr = ifa->ifa_local; break; } if (!addr) addr = ifa->ifa_local; } if (addr) goto out_unlock; no_in_dev: master_idx = l3mdev_master_ifindex_rcu(dev); /* For VRFs, the VRF device takes the place of the loopback device, * with addresses on it being preferred. Note in such cases the * loopback device will be among the devices that fail the master_idx * equality check in the loop below. */ if (master_idx && (dev = dev_get_by_index_rcu(net, master_idx)) && (in_dev = __in_dev_get_rcu(dev))) { addr = in_dev_select_addr(in_dev, scope); if (addr) goto out_unlock; } /* Not loopback addresses on loopback should be preferred in this case. It is important that lo is the first interface in dev_base list. */ for_each_netdev_rcu(net, dev) { if (l3mdev_master_ifindex_rcu(dev) != master_idx) continue; in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; addr = in_dev_select_addr(in_dev, scope); if (addr) goto out_unlock; } out_unlock: rcu_read_unlock(); return addr; } EXPORT_SYMBOL(inet_select_addr); static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, __be32 local, int scope) { unsigned char localnet_scope = RT_SCOPE_HOST; const struct in_ifaddr *ifa; __be32 addr = 0; int same = 0; if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) localnet_scope = RT_SCOPE_LINK; in_dev_for_each_ifa_rcu(ifa, in_dev) { unsigned char min_scope = min(ifa->ifa_scope, localnet_scope); if (!addr && (local == ifa->ifa_local || !local) && min_scope <= scope) { addr = ifa->ifa_local; if (same) break; } if (!same) { same = (!local || inet_ifa_match(local, ifa)) && (!dst || inet_ifa_match(dst, ifa)); if (same && addr) { if (local || !dst) break; /* Is the selected addr into dst subnet? */ if (inet_ifa_match(addr, ifa)) break; /* No, then can we use new local src? */ if (min_scope <= scope) { addr = ifa->ifa_local; break; } /* search for large dst subnet for addr */ same = 0; } } } return same ? addr : 0; } /* * Confirm that local IP address exists using wildcards: * - net: netns to check, cannot be NULL * - in_dev: only on this interface, NULL=any interface * - dst: only in the same subnet as dst, 0=any dst * - local: address, 0=autoselect the local address * - scope: maximum allowed scope value for the local address */ __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope) { __be32 addr = 0; struct net_device *dev; if (in_dev) return confirm_addr_indev(in_dev, dst, local, scope); rcu_read_lock(); for_each_netdev_rcu(net, dev) { in_dev = __in_dev_get_rcu(dev); if (in_dev) { addr = confirm_addr_indev(in_dev, dst, local, scope); if (addr) break; } } rcu_read_unlock(); return addr; } EXPORT_SYMBOL(inet_confirm_addr); /* * Device notifier */ int register_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inetaddr_chain, nb); } EXPORT_SYMBOL(register_inetaddr_notifier); int unregister_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inetaddr_chain, nb); } EXPORT_SYMBOL(unregister_inetaddr_notifier); int register_inetaddr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inetaddr_validator_chain, nb); } EXPORT_SYMBOL(register_inetaddr_validator_notifier); int unregister_inetaddr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inetaddr_validator_chain, nb); } EXPORT_SYMBOL(unregister_inetaddr_validator_notifier); /* Rename ifa_labels for a device name change. Make some effort to preserve * existing alias numbering and to create unique labels if possible. */ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) { struct in_ifaddr *ifa; int named = 0; in_dev_for_each_ifa_rtnl(ifa, in_dev) { char old[IFNAMSIZ], *dot; memcpy(old, ifa->ifa_label, IFNAMSIZ); memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); if (named++ == 0) goto skip; dot = strchr(old, ':'); if (!dot) { sprintf(old, ":%d", named); dot = old; } if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) strcat(ifa->ifa_label, dot); else strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); skip: rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } static void inetdev_send_gratuitous_arp(struct net_device *dev, struct in_device *in_dev) { const struct in_ifaddr *ifa; in_dev_for_each_ifa_rtnl(ifa, in_dev) { arp_send(ARPOP_REQUEST, ETH_P_ARP, ifa->ifa_local, dev, ifa->ifa_local, NULL, dev->dev_addr, NULL); } } /* Called only under RTNL semaphore */ static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev = __in_dev_get_rtnl(dev); ASSERT_RTNL(); if (!in_dev) { if (event == NETDEV_REGISTER) { in_dev = inetdev_init(dev); if (IS_ERR(in_dev)) return notifier_from_errno(PTR_ERR(in_dev)); if (dev->flags & IFF_LOOPBACK) { IN_DEV_CONF_SET(in_dev, NOXFRM, 1); IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); } } else if (event == NETDEV_CHANGEMTU) { /* Re-enabling IP */ if (inetdev_valid_mtu(dev->mtu)) in_dev = inetdev_init(dev); } goto out; } switch (event) { case NETDEV_REGISTER: pr_debug("%s: bug\n", __func__); RCU_INIT_POINTER(dev->ip_ptr, NULL); break; case NETDEV_UP: if (!inetdev_valid_mtu(dev->mtu)) break; if (dev->flags & IFF_LOOPBACK) { struct in_ifaddr *ifa = inet_alloc_ifa(in_dev); if (ifa) { ifa->ifa_local = ifa->ifa_address = htonl(INADDR_LOOPBACK); ifa->ifa_prefixlen = 8; ifa->ifa_mask = inet_make_mask(8); ifa->ifa_scope = RT_SCOPE_HOST; memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); inet_insert_ifa(ifa); } } ip_mc_up(in_dev); fallthrough; case NETDEV_CHANGEADDR: if (!IN_DEV_ARP_NOTIFY(in_dev)) break; fallthrough; case NETDEV_NOTIFY_PEERS: /* Send gratuitous ARP to notify of link change */ inetdev_send_gratuitous_arp(dev, in_dev); break; case NETDEV_DOWN: ip_mc_down(in_dev); break; case NETDEV_PRE_TYPE_CHANGE: ip_mc_unmap(in_dev); break; case NETDEV_POST_TYPE_CHANGE: ip_mc_remap(in_dev); break; case NETDEV_CHANGEMTU: if (inetdev_valid_mtu(dev->mtu)) break; /* disable IP when MTU is not enough */ fallthrough; case NETDEV_UNREGISTER: inetdev_destroy(in_dev); break; case NETDEV_CHANGENAME: /* Do not notify about label change, this event is * not interesting to applications using netlink. */ inetdev_changename(dev, in_dev); devinet_sysctl_unregister(in_dev); devinet_sysctl_register(in_dev); break; } out: return NOTIFY_DONE; } static struct notifier_block ip_netdev_notifier = { .notifier_call = inetdev_event, }; static size_t inet_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(4) /* IFA_ADDRESS */ + nla_total_size(4) /* IFA_LOCAL */ + nla_total_size(4) /* IFA_BROADCAST */ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ + nla_total_size(4) /* IFA_FLAGS */ + nla_total_size(1) /* IFA_PROTO */ + nla_total_size(4) /* IFA_RT_PRIORITY */ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ } static inline u32 cstamp_delta(unsigned long cstamp) { return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; } static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, unsigned long tstamp, u32 preferred, u32 valid) { struct ifa_cacheinfo ci; ci.cstamp = cstamp_delta(cstamp); ci.tstamp = cstamp_delta(tstamp); ci.ifa_prefered = preferred; ci.ifa_valid = valid; return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci); } static int inet_fill_ifaddr(struct sk_buff *skb, const struct in_ifaddr *ifa, struct inet_fill_args *args) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; unsigned long tstamp; u32 preferred, valid; u32 flags; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm), args->flags); if (!nlh) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifa_family = AF_INET; ifm->ifa_prefixlen = ifa->ifa_prefixlen; flags = READ_ONCE(ifa->ifa_flags); /* Warning : ifm->ifa_flags is an __u8, it holds only 8 bits. * The 32bit value is given in IFA_FLAGS attribute. */ ifm->ifa_flags = (__u8)flags; ifm->ifa_scope = ifa->ifa_scope; ifm->ifa_index = ifa->ifa_dev->dev->ifindex; if (args->netnsid >= 0 && nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) goto nla_put_failure; tstamp = READ_ONCE(ifa->ifa_tstamp); if (!(flags & IFA_F_PERMANENT)) { preferred = READ_ONCE(ifa->ifa_preferred_lft); valid = READ_ONCE(ifa->ifa_valid_lft); if (preferred != INFINITY_LIFE_TIME) { long tval = (jiffies - tstamp) / HZ; if (preferred > tval) preferred -= tval; else preferred = 0; if (valid != INFINITY_LIFE_TIME) { if (valid > tval) valid -= tval; else valid = 0; } } } else { preferred = INFINITY_LIFE_TIME; valid = INFINITY_LIFE_TIME; } if ((ifa->ifa_address && nla_put_in_addr(skb, IFA_ADDRESS, ifa->ifa_address)) || (ifa->ifa_local && nla_put_in_addr(skb, IFA_LOCAL, ifa->ifa_local)) || (ifa->ifa_broadcast && nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || (ifa->ifa_label[0] && nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || (ifa->ifa_proto && nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) || nla_put_u32(skb, IFA_FLAGS, flags) || (ifa->ifa_rt_priority && nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) || put_cacheinfo(skb, READ_ONCE(ifa->ifa_cstamp), tstamp, preferred, valid)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct inet_fill_args *fillargs, struct net **tgt_net, struct sock *sk, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[IFA_MAX+1]; struct ifaddrmsg *ifm; int err, i; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request"); return -EINVAL; } ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request"); return -EINVAL; } fillargs->ifindex = ifm->ifa_index; if (fillargs->ifindex) { cb->answer_flags |= NLM_F_DUMP_FILTERED; fillargs->flags |= NLM_F_DUMP_FILTERED; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) return err; for (i = 0; i <= IFA_MAX; ++i) { if (!tb[i]) continue; if (i == IFA_TARGET_NETNSID) { struct net *net; fillargs->netnsid = nla_get_s32(tb[i]); net = rtnl_get_net_ns_capable(sk, fillargs->netnsid); if (IS_ERR(net)) { fillargs->netnsid = -1; NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id"); return PTR_ERR(net); } *tgt_net = net; } else { NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in dump request"); return -EINVAL; } } return 0; } static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, struct netlink_callback *cb, int *s_ip_idx, struct inet_fill_args *fillargs) { struct in_ifaddr *ifa; int ip_idx = 0; int err; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (ip_idx < *s_ip_idx) { ip_idx++; continue; } err = inet_fill_ifaddr(skb, ifa, fillargs); if (err < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); ip_idx++; } err = 0; ip_idx = 0; done: *s_ip_idx = ip_idx; return err; } /* Combine dev_addr_genid and dev_base_seq to detect changes. */ static u32 inet_base_seq(const struct net *net) { u32 res = atomic_read(&net->ipv4.dev_addr_genid) + READ_ONCE(net->dev_base_seq); /* Must not return 0 (see nl_dump_check_consistent()). * Chose a value far away from 0. */ if (!res) res = 0x80000000; return res; } static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct inet_fill_args fillargs = { .portid = NETLINK_CB(cb->skb).portid, .seq = nlh->nlmsg_seq, .event = RTM_NEWADDR, .flags = NLM_F_MULTI, .netnsid = -1, }; struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct { unsigned long ifindex; int ip_idx; } *ctx = (void *)cb->ctx; struct in_device *in_dev; struct net_device *dev; int err = 0; rcu_read_lock(); if (cb->strict_check) { err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, skb->sk, cb); if (err < 0) goto done; if (fillargs.ifindex) { dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex); if (!dev) { err = -ENODEV; goto done; } in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto done; err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx, &fillargs); goto done; } } cb->seq = inet_base_seq(tgt_net); for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx, &fillargs); if (err < 0) goto done; } done: if (fillargs.netnsid >= 0) put_net(tgt_net); rcu_read_unlock(); return err; } static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid) { struct inet_fill_args fillargs = { .portid = portid, .seq = nlh ? nlh->nlmsg_seq : 0, .event = event, .flags = 0, .netnsid = -1, }; struct sk_buff *skb; int err = -ENOBUFS; struct net *net; net = dev_net(ifa->ifa_dev->dev); skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL); if (!skb) goto errout; err = inet_fill_ifaddr(skb, ifa, &fillargs); if (err < 0) { /* -EMSGSIZE implies BUG in inet_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); } static size_t inet_get_link_af_size(const struct net_device *dev, u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); if (!in_dev) return 0; return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */ } static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev, u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); struct nlattr *nla; int i; if (!in_dev) return -ENODATA; nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4); if (!nla) return -EMSGSIZE; for (i = 0; i < IPV4_DEVCONF_MAX; i++) ((u32 *) nla_data(nla))[i] = READ_ONCE(in_dev->cnf.data[i]); return 0; } static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = { [IFLA_INET_CONF] = { .type = NLA_NESTED }, }; static int inet_validate_link_af(const struct net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct nlattr *a, *tb[IFLA_INET_MAX+1]; int err, rem; if (dev && !__in_dev_get_rtnl(dev)) return -EAFNOSUPPORT; err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, inet_af_policy, extack); if (err < 0) return err; if (tb[IFLA_INET_CONF]) { nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) { int cfgid = nla_type(a); if (nla_len(a) < 4) return -EINVAL; if (cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX) return -EINVAL; } } return 0; } static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct in_device *in_dev = __in_dev_get_rtnl(dev); struct nlattr *a, *tb[IFLA_INET_MAX+1]; int rem; if (!in_dev) return -EAFNOSUPPORT; if (nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0) return -EINVAL; if (tb[IFLA_INET_CONF]) { nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a)); } return 0; } static int inet_netconf_msgsize_devconf(int type) { int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + nla_total_size(4); /* NETCONFA_IFINDEX */ bool all = false; if (type == NETCONFA_ALL) all = true; if (all || type == NETCONFA_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_RP_FILTER) size += nla_total_size(4); if (all || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_BC_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) size += nla_total_size(4); return size; } static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, const struct ipv4_devconf *devconf, u32 portid, u32 seq, int event, unsigned int flags, int type) { struct nlmsghdr *nlh; struct netconfmsg *ncm; bool all = false; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), flags); if (!nlh) return -EMSGSIZE; if (type == NETCONFA_ALL) all = true; ncm = nlmsg_data(nlh); ncm->ncm_family = AF_INET; if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) goto nla_put_failure; if (!devconf) goto out; if ((all || type == NETCONFA_FORWARDING) && nla_put_s32(skb, NETCONFA_FORWARDING, IPV4_DEVCONF_RO(*devconf, FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_RP_FILTER) && nla_put_s32(skb, NETCONFA_RP_FILTER, IPV4_DEVCONF_RO(*devconf, RP_FILTER)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_MC_FORWARDING) && nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF_RO(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_BC_FORWARDING) && nla_put_s32(skb, NETCONFA_BC_FORWARDING, IPV4_DEVCONF_RO(*devconf, BC_FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_PROXY_NEIGH) && nla_put_s32(skb, NETCONFA_PROXY_NEIGH, IPV4_DEVCONF_RO(*devconf, PROXY_ARP)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, IPV4_DEVCONF_RO(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0) goto nla_put_failure; out: nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } void inet_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv4_devconf *devconf) { struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_KERNEL); if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, event, 0, type); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err); } static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_RP_FILTER] = { .len = sizeof(int) }, [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) }, }; static int inet_netconf_valid_get_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_ipv4_policy, extack); err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_ipv4_policy, extack); if (err) return err; for (i = 0; i <= NETCONFA_MAX; i++) { if (!tb[i]) continue; switch (i) { case NETCONFA_IFINDEX: break; default: NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in netconf get request"); return -EINVAL; } } return 0; } static int inet_netconf_get_devconf(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX + 1]; const struct ipv4_devconf *devconf; struct in_device *in_dev = NULL; struct net_device *dev = NULL; struct sk_buff *skb; int ifindex; int err; err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack); if (err) return err; if (!tb[NETCONFA_IFINDEX]) return -EINVAL; ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); switch (ifindex) { case NETCONFA_IFINDEX_ALL: devconf = net->ipv4.devconf_all; break; case NETCONFA_IFINDEX_DEFAULT: devconf = net->ipv4.devconf_dflt; break; default: err = -ENODEV; dev = dev_get_by_index(net, ifindex); if (dev) in_dev = in_dev_get(dev); if (!in_dev) goto errout; devconf = &in_dev->cnf; break; } err = -ENOBUFS; skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, 0, NETCONFA_ALL); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: if (in_dev) in_dev_put(in_dev); dev_put(dev); return err; } static int inet_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct { unsigned long ifindex; unsigned int all_default; } *ctx = (void *)cb->ctx; const struct in_device *in_dev; struct net_device *dev; int err = 0; if (cb->strict_check) { struct netlink_ext_ack *extack = cb->extack; struct netconfmsg *ncm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ncm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid data after header in netconf dump request"); return -EINVAL; } } rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->ifindex) { in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; err = inet_netconf_fill_devconf(skb, dev->ifindex, &in_dev->cnf, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; } if (ctx->all_default == 0) { err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; ctx->all_default++; } if (ctx->all_default == 1) { err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; ctx->all_default++; } done: rcu_read_unlock(); return err; } #ifdef CONFIG_SYSCTL static void devinet_copy_dflt_conf(struct net *net, int i) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(net, dev) { struct in_device *in_dev; in_dev = __in_dev_get_rcu(dev); if (in_dev && !test_bit(i, in_dev->cnf.state)) in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i]; } rcu_read_unlock(); } /* called with RTNL locked */ static void inet_forward_change(struct net *net) { struct net_device *dev; int on = IPV4_DEVCONF_ALL(net, FORWARDING); IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; IPV4_DEVCONF_DFLT(net, FORWARDING) = on; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); for_each_netdev(net, dev) { struct in_device *in_dev; if (on) dev_disable_lro(dev); in_dev = __in_dev_get_rtnl_net(dev); if (in_dev) { IN_DEV_CONF_SET(in_dev, FORWARDING, on); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, dev->ifindex, &in_dev->cnf); } } } static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) { if (cnf == net->ipv4.devconf_dflt) return NETCONFA_IFINDEX_DEFAULT; else if (cnf == net->ipv4.devconf_all) return NETCONFA_IFINDEX_ALL; else { struct in_device *idev = container_of(cnf, struct in_device, cnf); return idev->dev->ifindex; } } static int devinet_conf_proc(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_value = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); int new_value = *(int *)ctl->data; if (write) { struct ipv4_devconf *cnf = ctl->extra1; struct net *net = ctl->extra2; int i = (int *)ctl->data - cnf->data; int ifindex; set_bit(i, cnf->state); if (cnf == net->ipv4.devconf_dflt) devinet_copy_dflt_conf(net, i); if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 || i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) if ((new_value == 0) && (old_value != 0)) rt_cache_flush(net); if (i == IPV4_DEVCONF_BC_FORWARDING - 1 && new_value != old_value) rt_cache_flush(net); if (i == IPV4_DEVCONF_RP_FILTER - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_RP_FILTER, ifindex, cnf); } if (i == IPV4_DEVCONF_PROXY_ARP - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_PROXY_NEIGH, ifindex, cnf); } if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, ifindex, cnf); } } return ret; } static int devinet_sysctl_forward(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; loff_t pos = *ppos; struct net *net = ctl->extra2; int ret; if (write && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write && *valp != val) { if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { if (!rtnl_net_trylock(net)) { /* Restore the original values before restarting */ *valp = val; *ppos = pos; return restart_syscall(); } if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { inet_forward_change(net); } else { struct ipv4_devconf *cnf = ctl->extra1; struct in_device *idev = container_of(cnf, struct in_device, cnf); if (*valp) dev_disable_lro(idev->dev); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, idev->dev->ifindex, cnf); } rtnl_net_unlock(net); rt_cache_flush(net); } else inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); } return ret; } static int ipv4_doint_and_flush(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); struct net *net = ctl->extra2; if (write && *valp != val) rt_cache_flush(net); return ret; } #define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \ { \ .procname = name, \ .data = ipv4_devconf.data + \ IPV4_DEVCONF_ ## attr - 1, \ .maxlen = sizeof(int), \ .mode = mval, \ .proc_handler = proc, \ .extra1 = &ipv4_devconf, \ } #define DEVINET_SYSCTL_RW_ENTRY(attr, name) \ DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc) #define DEVINET_SYSCTL_RO_ENTRY(attr, name) \ DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc) #define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \ DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc) #define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \ DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush) static struct devinet_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table devinet_vars[IPV4_DEVCONF_MAX]; } devinet_sysctl = { .devinet_vars = { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", devinet_sysctl_forward), DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"), DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"), DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"), DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE, "accept_source_route"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"), DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"), DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"), DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"), DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"), DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"), DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"), DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"), DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), DEVINET_SYSCTL_RW_ENTRY(ARP_EVICT_NOCARRIER, "arp_evict_nocarrier"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION, "force_igmp_version"), DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL, "igmpv2_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, "igmpv3_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, "ignore_routes_with_linkdown"), DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP, "drop_gratuitous_arp"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, "promote_secondaries"), DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, "route_localnet"), DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST, "drop_unicast_in_l2_multicast"), }, }; static int __devinet_sysctl_register(struct net *net, char *dev_name, int ifindex, struct ipv4_devconf *p) { int i; struct devinet_sysctl_table *t; char path[sizeof("net/ipv4/conf/") + IFNAMSIZ]; t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto out; for (i = 0; i < ARRAY_SIZE(t->devinet_vars); i++) { t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; t->devinet_vars[i].extra1 = p; t->devinet_vars[i].extra2 = net; } snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name); t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars); if (!t->sysctl_header) goto free; p->sysctl = t; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL, ifindex, p); return 0; free: kfree(t); out: return -ENOMEM; } static void __devinet_sysctl_unregister(struct net *net, struct ipv4_devconf *cnf, int ifindex) { struct devinet_sysctl_table *t = cnf->sysctl; if (t) { cnf->sysctl = NULL; unregister_net_sysctl_table(t->sysctl_header); kfree(t); } inet_netconf_notify_devconf(net, RTM_DELNETCONF, 0, ifindex, NULL); } static int devinet_sysctl_register(struct in_device *idev) { int err; if (!sysctl_dev_name_is_allowed(idev->dev->name)) return -EINVAL; err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL); if (err) return err; err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, idev->dev->ifindex, &idev->cnf); if (err) neigh_sysctl_unregister(idev->arp_parms); return err; } static void devinet_sysctl_unregister(struct in_device *idev) { struct net *net = dev_net(idev->dev); __devinet_sysctl_unregister(net, &idev->cnf, idev->dev->ifindex); neigh_sysctl_unregister(idev->arp_parms); } static struct ctl_table ctl_forward_entry[] = { { .procname = "ip_forward", .data = &ipv4_devconf.data[ IPV4_DEVCONF_FORWARDING - 1], .maxlen = sizeof(int), .mode = 0644, .proc_handler = devinet_sysctl_forward, .extra1 = &ipv4_devconf, .extra2 = &init_net, }, }; #endif static __net_init int devinet_init_net(struct net *net) { #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table *tbl; #endif struct ipv4_devconf *all, *dflt; int err; int i; err = -ENOMEM; net->ipv4.inet_addr_lst = kmalloc_array(IN4_ADDR_HSIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!net->ipv4.inet_addr_lst) goto err_alloc_hash; all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL); if (!all) goto err_alloc_all; dflt = kmemdup(&ipv4_devconf_dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); if (!dflt) goto err_alloc_dflt; #ifdef CONFIG_SYSCTL tbl = kmemdup(ctl_forward_entry, sizeof(ctl_forward_entry), GFP_KERNEL); if (!tbl) goto err_alloc_ctl; tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; tbl[0].extra1 = all; tbl[0].extra2 = net; #endif if (!net_eq(net, &init_net)) { switch (net_inherit_devconf()) { case 3: /* copy from the current netns */ memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, current->nsproxy->net_ns->ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); break; case 0: case 1: /* copy from init_net */ memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); break; case 2: /* use compiled values */ break; } } #ifdef CONFIG_SYSCTL err = __devinet_sysctl_register(net, "all", NETCONFA_IFINDEX_ALL, all); if (err < 0) goto err_reg_all; err = __devinet_sysctl_register(net, "default", NETCONFA_IFINDEX_DEFAULT, dflt); if (err < 0) goto err_reg_dflt; err = -ENOMEM; forw_hdr = register_net_sysctl_sz(net, "net/ipv4", tbl, ARRAY_SIZE(ctl_forward_entry)); if (!forw_hdr) goto err_reg_ctl; net->ipv4.forw_hdr = forw_hdr; #endif for (i = 0; i < IN4_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&net->ipv4.inet_addr_lst[i]); INIT_DEFERRABLE_WORK(&net->ipv4.addr_chk_work, check_lifetime); net->ipv4.devconf_all = all; net->ipv4.devconf_dflt = dflt; return 0; #ifdef CONFIG_SYSCTL err_reg_ctl: __devinet_sysctl_unregister(net, dflt, NETCONFA_IFINDEX_DEFAULT); err_reg_dflt: __devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL); err_reg_all: kfree(tbl); err_alloc_ctl: #endif kfree(dflt); err_alloc_dflt: kfree(all); err_alloc_all: kfree(net->ipv4.inet_addr_lst); err_alloc_hash: return err; } static __net_exit void devinet_exit_net(struct net *net) { #ifdef CONFIG_SYSCTL const struct ctl_table *tbl; #endif cancel_delayed_work_sync(&net->ipv4.addr_chk_work); #ifdef CONFIG_SYSCTL tbl = net->ipv4.forw_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.forw_hdr); __devinet_sysctl_unregister(net, net->ipv4.devconf_dflt, NETCONFA_IFINDEX_DEFAULT); __devinet_sysctl_unregister(net, net->ipv4.devconf_all, NETCONFA_IFINDEX_ALL); kfree(tbl); #endif kfree(net->ipv4.devconf_dflt); kfree(net->ipv4.devconf_all); kfree(net->ipv4.inet_addr_lst); } static __net_initdata struct pernet_operations devinet_ops = { .init = devinet_init_net, .exit = devinet_exit_net, }; static struct rtnl_af_ops inet_af_ops __read_mostly = { .family = AF_INET, .fill_link_af = inet_fill_link_af, .get_link_af_size = inet_get_link_af_size, .validate_link_af = inet_validate_link_af, .set_link_af = inet_set_link_af, }; static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_NEWADDR, .doit = inet_rtm_newaddr, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_DELADDR, .doit = inet_rtm_deladdr, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETADDR, .dumpit = inet_dump_ifaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, {.protocol = PF_INET, .msgtype = RTM_GETNETCONF, .doit = inet_netconf_get_devconf, .dumpit = inet_netconf_dump_devconf, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, }; void __init devinet_init(void) { register_pernet_subsys(&devinet_ops); register_netdevice_notifier(&ip_netdev_notifier); if (rtnl_af_register(&inet_af_ops)) panic("Unable to register inet_af_ops\n"); rtnl_register_many(devinet_rtnl_msg_handlers); }
104 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 // SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/sys.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/export.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/utsname.h> #include <linux/mman.h> #include <linux/reboot.h> #include <linux/prctl.h> #include <linux/highuid.h> #include <linux/fs.h> #include <linux/kmod.h> #include <linux/ksm.h> #include <linux/perf_event.h> #include <linux/resource.h> #include <linux/kernel.h> #include <linux/workqueue.h> #include <linux/capability.h> #include <linux/device.h> #include <linux/key.h> #include <linux/times.h> #include <linux/posix-timers.h> #include <linux/security.h> #include <linux/random.h> #include <linux/suspend.h> #include <linux/tty.h> #include <linux/signal.h> #include <linux/cn_proc.h> #include <linux/getcpu.h> #include <linux/task_io_accounting_ops.h> #include <linux/seccomp.h> #include <linux/cpu.h> #include <linux/personality.h> #include <linux/ptrace.h> #include <linux/fs_struct.h> #include <linux/file.h> #include <linux/mount.h> #include <linux/gfp.h> #include <linux/syscore_ops.h> #include <linux/version.h> #include <linux/ctype.h> #include <linux/syscall_user_dispatch.h> #include <linux/compat.h> #include <linux/syscalls.h> #include <linux/kprobes.h> #include <linux/user_namespace.h> #include <linux/time_namespace.h> #include <linux/binfmts.h> #include <linux/sched.h> #include <linux/sched/autogroup.h> #include <linux/sched/loadavg.h> #include <linux/sched/stat.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/task.h> #include <linux/sched/cputime.h> #include <linux/rcupdate.h> #include <linux/uidgid.h> #include <linux/cred.h> #include <linux/nospec.h> #include <linux/kmsg_dump.h> /* Move somewhere else to avoid recompiling? */ #include <generated/utsrelease.h> #include <linux/uaccess.h> #include <asm/io.h> #include <asm/unistd.h> #include "uid16.h" #ifndef SET_UNALIGN_CTL # define SET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef GET_UNALIGN_CTL # define GET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEMU_CTL # define SET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEMU_CTL # define GET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEXC_CTL # define SET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEXC_CTL # define GET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_ENDIAN # define GET_ENDIAN(a, b) (-EINVAL) #endif #ifndef SET_ENDIAN # define SET_ENDIAN(a, b) (-EINVAL) #endif #ifndef GET_TSC_CTL # define GET_TSC_CTL(a) (-EINVAL) #endif #ifndef SET_TSC_CTL # define SET_TSC_CTL(a) (-EINVAL) #endif #ifndef GET_FP_MODE # define GET_FP_MODE(a) (-EINVAL) #endif #ifndef SET_FP_MODE # define SET_FP_MODE(a,b) (-EINVAL) #endif #ifndef SVE_SET_VL # define SVE_SET_VL(a) (-EINVAL) #endif #ifndef SVE_GET_VL # define SVE_GET_VL() (-EINVAL) #endif #ifndef SME_SET_VL # define SME_SET_VL(a) (-EINVAL) #endif #ifndef SME_GET_VL # define SME_GET_VL() (-EINVAL) #endif #ifndef PAC_RESET_KEYS # define PAC_RESET_KEYS(a, b) (-EINVAL) #endif #ifndef PAC_SET_ENABLED_KEYS # define PAC_SET_ENABLED_KEYS(a, b, c) (-EINVAL) #endif #ifndef PAC_GET_ENABLED_KEYS # define PAC_GET_ENABLED_KEYS(a) (-EINVAL) #endif #ifndef SET_TAGGED_ADDR_CTRL # define SET_TAGGED_ADDR_CTRL(a) (-EINVAL) #endif #ifndef GET_TAGGED_ADDR_CTRL # define GET_TAGGED_ADDR_CTRL() (-EINVAL) #endif #ifndef RISCV_V_SET_CONTROL # define RISCV_V_SET_CONTROL(a) (-EINVAL) #endif #ifndef RISCV_V_GET_CONTROL # define RISCV_V_GET_CONTROL() (-EINVAL) #endif #ifndef RISCV_SET_ICACHE_FLUSH_CTX # define RISCV_SET_ICACHE_FLUSH_CTX(a, b) (-EINVAL) #endif #ifndef PPC_GET_DEXCR_ASPECT # define PPC_GET_DEXCR_ASPECT(a, b) (-EINVAL) #endif #ifndef PPC_SET_DEXCR_ASPECT # define PPC_SET_DEXCR_ASPECT(a, b, c) (-EINVAL) #endif /* * this is where the system-wide overflow UID and GID are defined, for * architectures that now have 32-bit UID/GID but didn't in the past */ int overflowuid = DEFAULT_OVERFLOWUID; int overflowgid = DEFAULT_OVERFLOWGID; EXPORT_SYMBOL(overflowuid); EXPORT_SYMBOL(overflowgid); /* * the same as above, but for filesystems which can only store a 16-bit * UID and GID. as such, this is needed on all architectures */ int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; int fs_overflowgid = DEFAULT_FS_OVERFLOWGID; EXPORT_SYMBOL(fs_overflowuid); EXPORT_SYMBOL(fs_overflowgid); /* * Returns true if current's euid is same as p's uid or euid, * or has CAP_SYS_NICE to p's user_ns. * * Called with rcu_read_lock, creds are safe */ static bool set_one_prio_perm(struct task_struct *p) { const struct cred *cred = current_cred(), *pcred = __task_cred(p); if (uid_eq(pcred->uid, cred->euid) || uid_eq(pcred->euid, cred->euid)) return true; if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) return true; return false; } /* * set the priority of a task * - the caller must hold the RCU read lock */ static int set_one_prio(struct task_struct *p, int niceval, int error) { int no_nice; if (!set_one_prio_perm(p)) { error = -EPERM; goto out; } if (niceval < task_nice(p) && !can_nice(p, niceval)) { error = -EACCES; goto out; } no_nice = security_task_setnice(p, niceval); if (no_nice) { error = no_nice; goto out; } if (error == -ESRCH) error = 0; set_user_nice(p, niceval); out: return error; } SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) { struct task_struct *g, *p; struct user_struct *user; const struct cred *cred = current_cred(); int error = -EINVAL; struct pid *pgrp; kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) goto out; /* normalize: avoid signed division (rounding problems) */ error = -ESRCH; if (niceval < MIN_NICE) niceval = MIN_NICE; if (niceval > MAX_NICE) niceval = MAX_NICE; rcu_read_lock(); switch (which) { case PRIO_PROCESS: if (who) p = find_task_by_vpid(who); else p = current; if (p) error = set_one_prio(p, niceval, error); break; case PRIO_PGRP: if (who) pgrp = find_vpid(who); else pgrp = task_pgrp(current); read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); read_unlock(&tasklist_lock); break; case PRIO_USER: uid = make_kuid(cred->user_ns, who); user = cred->user; if (!who) uid = cred->uid; else if (!uid_eq(uid, cred->uid)) { user = find_user(uid); if (!user) goto out_unlock; /* No processes for this user */ } for_each_process_thread(g, p) { if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) error = set_one_prio(p, niceval, error); } if (!uid_eq(uid, cred->uid)) free_uid(user); /* For find_user() */ break; } out_unlock: rcu_read_unlock(); out: return error; } /* * Ugh. To avoid negative return values, "getpriority()" will * not return the normal nice-value, but a negated value that * has been offset by 20 (ie it returns 40..1 instead of -20..19) * to stay compatible. */ SYSCALL_DEFINE2(getpriority, int, which, int, who) { struct task_struct *g, *p; struct user_struct *user; const struct cred *cred = current_cred(); long niceval, retval = -ESRCH; struct pid *pgrp; kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) return -EINVAL; rcu_read_lock(); switch (which) { case PRIO_PROCESS: if (who) p = find_task_by_vpid(who); else p = current; if (p) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } break; case PRIO_PGRP: if (who) pgrp = find_vpid(who); else pgrp = task_pgrp(current); read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); read_unlock(&tasklist_lock); break; case PRIO_USER: uid = make_kuid(cred->user_ns, who); user = cred->user; if (!who) uid = cred->uid; else if (!uid_eq(uid, cred->uid)) { user = find_user(uid); if (!user) goto out_unlock; /* No processes for this user */ } for_each_process_thread(g, p) { if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } } if (!uid_eq(uid, cred->uid)) free_uid(user); /* for find_user() */ break; } out_unlock: rcu_read_unlock(); return retval; } /* * Unprivileged users may change the real gid to the effective gid * or vice versa. (BSD-style) * * If you set the real gid at all, or set the effective gid to a value not * equal to the real gid, then the saved gid is set to the new effective gid. * * This makes it possible for a setgid program to completely drop its * privileges, which is often a useful assertion to make when you are doing * a security audit over a program. * * The general idea is that a program which uses just setregid() will be * 100% compatible with BSD. A program which uses just setgid() will be * 100% compatible with POSIX with saved IDs. * * SMP: There are not races, the GIDs are checked only by filesystem * operations (as far as semantic preservation is concerned). */ #ifdef CONFIG_MULTIUSER long __sys_setregid(gid_t rgid, gid_t egid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kgid_t krgid, kegid; krgid = make_kgid(ns, rgid); kegid = make_kgid(ns, egid); if ((rgid != (gid_t) -1) && !gid_valid(krgid)) return -EINVAL; if ((egid != (gid_t) -1) && !gid_valid(kegid)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); retval = -EPERM; if (rgid != (gid_t) -1) { if (gid_eq(old->gid, krgid) || gid_eq(old->egid, krgid) || ns_capable_setid(old->user_ns, CAP_SETGID)) new->gid = krgid; else goto error; } if (egid != (gid_t) -1) { if (gid_eq(old->gid, kegid) || gid_eq(old->egid, kegid) || gid_eq(old->sgid, kegid) || ns_capable_setid(old->user_ns, CAP_SETGID)) new->egid = kegid; else goto error; } if (rgid != (gid_t) -1 || (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) new->sgid = new->egid; new->fsgid = new->egid; retval = security_task_fix_setgid(new, old, LSM_SETID_RE); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) { return __sys_setregid(rgid, egid); } /* * setgid() is implemented like SysV w/ SAVED_IDS * * SMP: Same implicit races as above. */ long __sys_setgid(gid_t gid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kgid_t kgid; kgid = make_kgid(ns, gid); if (!gid_valid(kgid)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); retval = -EPERM; if (ns_capable_setid(old->user_ns, CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = kgid; else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) new->egid = new->fsgid = kgid; else goto error; retval = security_task_fix_setgid(new, old, LSM_SETID_ID); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE1(setgid, gid_t, gid) { return __sys_setgid(gid); } /* * change the user struct in a credentials set to match the new UID */ static int set_user(struct cred *new) { struct user_struct *new_user; new_user = alloc_uid(new->uid); if (!new_user) return -EAGAIN; free_uid(new->user); new->user = new_user; return 0; } static void flag_nproc_exceeded(struct cred *new) { if (new->ucounts == current_ucounts()) return; /* * We don't fail in case of NPROC limit excess here because too many * poorly written programs don't check set*uid() return code, assuming * it never fails if called by root. We may still enforce NPROC limit * for programs doing set*uid()+execve() by harmlessly deferring the * failure to the execve() stage. */ if (is_rlimit_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && new->user != INIT_USER) current->flags |= PF_NPROC_EXCEEDED; else current->flags &= ~PF_NPROC_EXCEEDED; } /* * Unprivileged users may change the real uid to the effective uid * or vice versa. (BSD-style) * * If you set the real uid at all, or set the effective uid to a value not * equal to the real uid, then the saved uid is set to the new effective uid. * * This makes it possible for a setuid program to completely drop its * privileges, which is often a useful assertion to make when you are doing * a security audit over a program. * * The general idea is that a program which uses just setreuid() will be * 100% compatible with BSD. A program which uses just setuid() will be * 100% compatible with POSIX with saved IDs. */ long __sys_setreuid(uid_t ruid, uid_t euid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kuid_t kruid, keuid; kruid = make_kuid(ns, ruid); keuid = make_kuid(ns, euid); if ((ruid != (uid_t) -1) && !uid_valid(kruid)) return -EINVAL; if ((euid != (uid_t) -1) && !uid_valid(keuid)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); retval = -EPERM; if (ruid != (uid_t) -1) { new->uid = kruid; if (!uid_eq(old->uid, kruid) && !uid_eq(old->euid, kruid) && !ns_capable_setid(old->user_ns, CAP_SETUID)) goto error; } if (euid != (uid_t) -1) { new->euid = keuid; if (!uid_eq(old->uid, keuid) && !uid_eq(old->euid, keuid) && !uid_eq(old->suid, keuid) && !ns_capable_setid(old->user_ns, CAP_SETUID)) goto error; } if (!uid_eq(new->uid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } if (ruid != (uid_t) -1 || (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) new->suid = new->euid; new->fsuid = new->euid; retval = security_task_fix_setuid(new, old, LSM_SETID_RE); if (retval < 0) goto error; retval = set_cred_ucounts(new); if (retval < 0) goto error; flag_nproc_exceeded(new); return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) { return __sys_setreuid(ruid, euid); } /* * setuid() is implemented like SysV with SAVED_IDS * * Note that SAVED_ID's is deficient in that a setuid root program * like sendmail, for example, cannot set its uid to be a normal * user and then switch back, because if you're root, setuid() sets * the saved uid too. If you don't like this, blame the bright people * in the POSIX committee and/or USG. Note that the BSD-style setreuid() * will allow a root program to temporarily drop privileges and be able to * regain them by swapping the real and effective uid. */ long __sys_setuid(uid_t uid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kuid_t kuid; kuid = make_kuid(ns, uid); if (!uid_valid(kuid)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); retval = -EPERM; if (ns_capable_setid(old->user_ns, CAP_SETUID)) { new->suid = new->uid = kuid; if (!uid_eq(kuid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { goto error; } new->fsuid = new->euid = kuid; retval = security_task_fix_setuid(new, old, LSM_SETID_ID); if (retval < 0) goto error; retval = set_cred_ucounts(new); if (retval < 0) goto error; flag_nproc_exceeded(new); return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE1(setuid, uid_t, uid) { return __sys_setuid(uid); } /* * This function implements a generic ability to update ruid, euid, * and suid. This allows you to implement the 4.4 compatible seteuid(). */ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kuid_t kruid, keuid, ksuid; bool ruid_new, euid_new, suid_new; kruid = make_kuid(ns, ruid); keuid = make_kuid(ns, euid); ksuid = make_kuid(ns, suid); if ((ruid != (uid_t) -1) && !uid_valid(kruid)) return -EINVAL; if ((euid != (uid_t) -1) && !uid_valid(keuid)) return -EINVAL; if ((suid != (uid_t) -1) && !uid_valid(ksuid)) return -EINVAL; old = current_cred(); /* check for no-op */ if ((ruid == (uid_t) -1 || uid_eq(kruid, old->uid)) && (euid == (uid_t) -1 || (uid_eq(keuid, old->euid) && uid_eq(keuid, old->fsuid))) && (suid == (uid_t) -1 || uid_eq(ksuid, old->suid))) return 0; ruid_new = ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid); euid_new = euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid); suid_new = suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid); if ((ruid_new || euid_new || suid_new) && !ns_capable_setid(old->user_ns, CAP_SETUID)) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (ruid != (uid_t) -1) { new->uid = kruid; if (!uid_eq(kruid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } } if (euid != (uid_t) -1) new->euid = keuid; if (suid != (uid_t) -1) new->suid = ksuid; new->fsuid = new->euid; retval = security_task_fix_setuid(new, old, LSM_SETID_RES); if (retval < 0) goto error; retval = set_cred_ucounts(new); if (retval < 0) goto error; flag_nproc_exceeded(new); return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) { return __sys_setresuid(ruid, euid, suid); } SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) { const struct cred *cred = current_cred(); int retval; uid_t ruid, euid, suid; ruid = from_kuid_munged(cred->user_ns, cred->uid); euid = from_kuid_munged(cred->user_ns, cred->euid); suid = from_kuid_munged(cred->user_ns, cred->suid); retval = put_user(ruid, ruidp); if (!retval) { retval = put_user(euid, euidp); if (!retval) return put_user(suid, suidp); } return retval; } /* * Same as above, but for rgid, egid, sgid. */ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kgid_t krgid, kegid, ksgid; bool rgid_new, egid_new, sgid_new; krgid = make_kgid(ns, rgid); kegid = make_kgid(ns, egid); ksgid = make_kgid(ns, sgid); if ((rgid != (gid_t) -1) && !gid_valid(krgid)) return -EINVAL; if ((egid != (gid_t) -1) && !gid_valid(kegid)) return -EINVAL; if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) return -EINVAL; old = current_cred(); /* check for no-op */ if ((rgid == (gid_t) -1 || gid_eq(krgid, old->gid)) && (egid == (gid_t) -1 || (gid_eq(kegid, old->egid) && gid_eq(kegid, old->fsgid))) && (sgid == (gid_t) -1 || gid_eq(ksgid, old->sgid))) return 0; rgid_new = rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid); egid_new = egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid); sgid_new = sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid); if ((rgid_new || egid_new || sgid_new) && !ns_capable_setid(old->user_ns, CAP_SETGID)) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (rgid != (gid_t) -1) new->gid = krgid; if (egid != (gid_t) -1) new->egid = kegid; if (sgid != (gid_t) -1) new->sgid = ksgid; new->fsgid = new->egid; retval = security_task_fix_setgid(new, old, LSM_SETID_RES); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) { return __sys_setresgid(rgid, egid, sgid); } SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) { const struct cred *cred = current_cred(); int retval; gid_t rgid, egid, sgid; rgid = from_kgid_munged(cred->user_ns, cred->gid); egid = from_kgid_munged(cred->user_ns, cred->egid); sgid = from_kgid_munged(cred->user_ns, cred->sgid); retval = put_user(rgid, rgidp); if (!retval) { retval = put_user(egid, egidp); if (!retval) retval = put_user(sgid, sgidp); } return retval; } /* * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This * is used for "access()" and for the NFS daemon (letting nfsd stay at * whatever uid it wants to). It normally shadows "euid", except when * explicitly set by setfsuid() or for access.. */ long __sys_setfsuid(uid_t uid) { const struct cred *old; struct cred *new; uid_t old_fsuid; kuid_t kuid; old = current_cred(); old_fsuid = from_kuid_munged(old->user_ns, old->fsuid); kuid = make_kuid(old->user_ns, uid); if (!uid_valid(kuid)) return old_fsuid; new = prepare_creds(); if (!new) return old_fsuid; if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || ns_capable_setid(old->user_ns, CAP_SETUID)) { if (!uid_eq(kuid, old->fsuid)) { new->fsuid = kuid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) goto change_okay; } } abort_creds(new); return old_fsuid; change_okay: commit_creds(new); return old_fsuid; } SYSCALL_DEFINE1(setfsuid, uid_t, uid) { return __sys_setfsuid(uid); } /* * Samma på svenska.. */ long __sys_setfsgid(gid_t gid) { const struct cred *old; struct cred *new; gid_t old_fsgid; kgid_t kgid; old = current_cred(); old_fsgid = from_kgid_munged(old->user_ns, old->fsgid); kgid = make_kgid(old->user_ns, gid); if (!gid_valid(kgid)) return old_fsgid; new = prepare_creds(); if (!new) return old_fsgid; if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || ns_capable_setid(old->user_ns, CAP_SETGID)) { if (!gid_eq(kgid, old->fsgid)) { new->fsgid = kgid; if (security_task_fix_setgid(new,old,LSM_SETID_FS) == 0) goto change_okay; } } abort_creds(new); return old_fsgid; change_okay: commit_creds(new); return old_fsgid; } SYSCALL_DEFINE1(setfsgid, gid_t, gid) { return __sys_setfsgid(gid); } #endif /* CONFIG_MULTIUSER */ /** * sys_getpid - return the thread group id of the current process * * Note, despite the name, this returns the tgid not the pid. The tgid and * the pid are identical unless CLONE_THREAD was specified on clone() in * which case the tgid is the same in all threads of the same group. * * This is SMP safe as current->tgid does not change. */ SYSCALL_DEFINE0(getpid) { return task_tgid_vnr(current); } /* Thread ID - the internal kernel "pid" */ SYSCALL_DEFINE0(gettid) { return task_pid_vnr(current); } /* * Accessing ->real_parent is not SMP-safe, it could * change from under us. However, we can use a stale * value of ->real_parent under rcu_read_lock(), see * release_task()->call_rcu(delayed_put_task_struct). */ SYSCALL_DEFINE0(getppid) { int pid; rcu_read_lock(); pid = task_tgid_vnr(rcu_dereference(current->real_parent)); rcu_read_unlock(); return pid; } SYSCALL_DEFINE0(getuid) { /* Only we change this so SMP safe */ return from_kuid_munged(current_user_ns(), current_uid()); } SYSCALL_DEFINE0(geteuid) { /* Only we change this so SMP safe */ return from_kuid_munged(current_user_ns(), current_euid()); } SYSCALL_DEFINE0(getgid) { /* Only we change this so SMP safe */ return from_kgid_munged(current_user_ns(), current_gid()); } SYSCALL_DEFINE0(getegid) { /* Only we change this so SMP safe */ return from_kgid_munged(current_user_ns(), current_egid()); } static void do_sys_times(struct tms *tms) { u64 tgutime, tgstime, cutime, cstime; thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; tms->tms_utime = nsec_to_clock_t(tgutime); tms->tms_stime = nsec_to_clock_t(tgstime); tms->tms_cutime = nsec_to_clock_t(cutime); tms->tms_cstime = nsec_to_clock_t(cstime); } SYSCALL_DEFINE1(times, struct tms __user *, tbuf) { if (tbuf) { struct tms tmp; do_sys_times(&tmp); if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } force_successful_syscall_return(); return (long) jiffies_64_to_clock_t(get_jiffies_64()); } #ifdef CONFIG_COMPAT static compat_clock_t clock_t_to_compat_clock_t(clock_t x) { return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); } COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) { if (tbuf) { struct tms tms; struct compat_tms tmp; do_sys_times(&tms); /* Convert our struct tms to the compat version. */ tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } force_successful_syscall_return(); return compat_jiffies_to_clock_t(jiffies); } #endif /* * This needs some heavy checking ... * I just haven't the stomach for it. I also don't fully * understand sessions/pgrp etc. Let somebody who does explain it. * * OK, I think I have the protection semantics right.... this is really * only important on a multi-user system anyway, to make sure one user * can't send a signal to a process owned by another. -TYT, 12/12/91 * * !PF_FORKNOEXEC check to conform completely to POSIX. */ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { struct task_struct *p; struct task_struct *group_leader = current->group_leader; struct pid *pgrp; int err; if (!pid) pid = task_pid_vnr(group_leader); if (!pgid) pgid = pid; if (pgid < 0) return -EINVAL; rcu_read_lock(); /* From this point forward we keep holding onto the tasklist lock * so that our parent does not change from under us. -DaveM */ write_lock_irq(&tasklist_lock); err = -ESRCH; p = find_task_by_vpid(pid); if (!p) goto out; err = -EINVAL; if (!thread_group_leader(p)) goto out; if (same_thread_group(p->real_parent, group_leader)) { err = -EPERM; if (task_session(p) != task_session(group_leader)) goto out; err = -EACCES; if (!(p->flags & PF_FORKNOEXEC)) goto out; } else { err = -ESRCH; if (p != group_leader) goto out; } err = -EPERM; if (p->signal->leader) goto out; pgrp = task_pid(p); if (pgid != pid) { struct task_struct *g; pgrp = find_vpid(pgid); g = pid_task(pgrp, PIDTYPE_PGID); if (!g || task_session(g) != task_session(group_leader)) goto out; } err = security_task_setpgid(p, pgid); if (err) goto out; if (task_pgrp(p) != pgrp) change_pid(p, PIDTYPE_PGID, pgrp); err = 0; out: /* All paths lead to here, thus we are safe. -DaveM */ write_unlock_irq(&tasklist_lock); rcu_read_unlock(); return err; } static int do_getpgid(pid_t pid) { struct task_struct *p; struct pid *grp; int retval; rcu_read_lock(); if (!pid) grp = task_pgrp(current); else { retval = -ESRCH; p = find_task_by_vpid(pid); if (!p) goto out; grp = task_pgrp(p); if (!grp) goto out; retval = security_task_getpgid(p); if (retval) goto out; } retval = pid_vnr(grp); out: rcu_read_unlock(); return retval; } SYSCALL_DEFINE1(getpgid, pid_t, pid) { return do_getpgid(pid); } #ifdef __ARCH_WANT_SYS_GETPGRP SYSCALL_DEFINE0(getpgrp) { return do_getpgid(0); } #endif SYSCALL_DEFINE1(getsid, pid_t, pid) { struct task_struct *p; struct pid *sid; int retval; rcu_read_lock(); if (!pid) sid = task_session(current); else { retval = -ESRCH; p = find_task_by_vpid(pid); if (!p) goto out; sid = task_session(p); if (!sid) goto out; retval = security_task_getsid(p); if (retval) goto out; } retval = pid_vnr(sid); out: rcu_read_unlock(); return retval; } static void set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; if (task_session(curr) != pid) change_pid(curr, PIDTYPE_SID, pid); if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); } int ksys_setsid(void) { struct task_struct *group_leader = current->group_leader; struct pid *sid = task_pid(group_leader); pid_t session = pid_vnr(sid); int err = -EPERM; write_lock_irq(&tasklist_lock); /* Fail if I am already a session leader */ if (group_leader->signal->leader) goto out; /* Fail if a process group id already exists that equals the * proposed session id. */ if (pid_task(sid, PIDTYPE_PGID)) goto out; group_leader->signal->leader = 1; set_special_pids(sid); proc_clear_tty(group_leader); err = session; out: write_unlock_irq(&tasklist_lock); if (err > 0) { proc_sid_connector(group_leader); sched_autogroup_create_attach(group_leader); } return err; } SYSCALL_DEFINE0(setsid) { return ksys_setsid(); } DECLARE_RWSEM(uts_sem); #ifdef COMPAT_UTS_MACHINE #define override_architecture(name) \ (personality(current->personality) == PER_LINUX32 && \ copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ sizeof(COMPAT_UTS_MACHINE))) #else #define override_architecture(name) 0 #endif /* * Work around broken programs that cannot handle "Linux 3.0". * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 * And we map 4.x and later versions to 2.6.60+x, so 4.0/5.0/6.0/... would be * 2.6.60. */ static int override_release(char __user *release, size_t len) { int ret = 0; if (current->personality & UNAME26) { const char *rest = UTS_RELEASE; char buf[65] = { 0 }; int ndots = 0; unsigned v; size_t copy; while (*rest) { if (*rest == '.' && ++ndots >= 3) break; if (!isdigit(*rest) && *rest != '.') break; rest++; } v = LINUX_VERSION_PATCHLEVEL + 60; copy = clamp_t(size_t, len, 1, sizeof(buf)); copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); ret = copy_to_user(release, buf, copy + 1); } return ret; } SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { struct new_utsname tmp; down_read(&uts_sem); memcpy(&tmp, utsname(), sizeof(tmp)); up_read(&uts_sem); if (copy_to_user(name, &tmp, sizeof(tmp))) return -EFAULT; if (override_release(name->release, sizeof(name->release))) return -EFAULT; if (override_architecture(name)) return -EFAULT; return 0; } #ifdef __ARCH_WANT_SYS_OLD_UNAME /* * Old cruft */ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) { struct old_utsname tmp; if (!name) return -EFAULT; down_read(&uts_sem); memcpy(&tmp, utsname(), sizeof(tmp)); up_read(&uts_sem); if (copy_to_user(name, &tmp, sizeof(tmp))) return -EFAULT; if (override_release(name->release, sizeof(name->release))) return -EFAULT; if (override_architecture(name)) return -EFAULT; return 0; } SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) { struct oldold_utsname tmp; if (!name) return -EFAULT; memset(&tmp, 0, sizeof(tmp)); down_read(&uts_sem); memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN); memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN); memcpy(&tmp.release, &utsname()->release, __OLD_UTS_LEN); memcpy(&tmp.version, &utsname()->version, __OLD_UTS_LEN); memcpy(&tmp.machine, &utsname()->machine, __OLD_UTS_LEN); up_read(&uts_sem); if (copy_to_user(name, &tmp, sizeof(tmp))) return -EFAULT; if (override_architecture(name)) return -EFAULT; if (override_release(name->release, sizeof(name->release))) return -EFAULT; return 0; } #endif SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) { int errno; char tmp[__NEW_UTS_LEN]; if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { struct new_utsname *u; add_device_randomness(tmp, len); down_write(&uts_sem); u = utsname(); memcpy(u->nodename, tmp, len); memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; uts_proc_notify(UTS_PROC_HOSTNAME); up_write(&uts_sem); } return errno; } #ifdef __ARCH_WANT_SYS_GETHOSTNAME SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) { int i; struct new_utsname *u; char tmp[__NEW_UTS_LEN + 1]; if (len < 0) return -EINVAL; down_read(&uts_sem); u = utsname(); i = 1 + strlen(u->nodename); if (i > len) i = len; memcpy(tmp, u->nodename, i); up_read(&uts_sem); if (copy_to_user(name, tmp, i)) return -EFAULT; return 0; } #endif /* * Only setdomainname; getdomainname can be implemented by calling * uname() */ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) { int errno; char tmp[__NEW_UTS_LEN]; if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { struct new_utsname *u; add_device_randomness(tmp, len); down_write(&uts_sem); u = utsname(); memcpy(u->domainname, tmp, len); memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; uts_proc_notify(UTS_PROC_DOMAINNAME); up_write(&uts_sem); } return errno; } /* make sure you are allowed to change @tsk limits before calling this */ static int do_prlimit(struct task_struct *tsk, unsigned int resource, struct rlimit *new_rlim, struct rlimit *old_rlim) { struct rlimit *rlim; int retval = 0; if (resource >= RLIM_NLIMITS) return -EINVAL; resource = array_index_nospec(resource, RLIM_NLIMITS); if (new_rlim) { if (new_rlim->rlim_cur > new_rlim->rlim_max) return -EINVAL; if (resource == RLIMIT_NOFILE && new_rlim->rlim_max > sysctl_nr_open) return -EPERM; } /* Holding a refcount on tsk protects tsk->signal from disappearing. */ rlim = tsk->signal->rlim + resource; task_lock(tsk->group_leader); if (new_rlim) { /* * Keep the capable check against init_user_ns until cgroups can * contain all limits. */ if (new_rlim->rlim_max > rlim->rlim_max && !capable(CAP_SYS_RESOURCE)) retval = -EPERM; if (!retval) retval = security_task_setrlimit(tsk, resource, new_rlim); } if (!retval) { if (old_rlim) *old_rlim = *rlim; if (new_rlim) *rlim = *new_rlim; } task_unlock(tsk->group_leader); /* * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not * infinite. In case of RLIM_INFINITY the posix CPU timer code * ignores the rlimit. */ if (!retval && new_rlim && resource == RLIMIT_CPU && new_rlim->rlim_cur != RLIM_INFINITY && IS_ENABLED(CONFIG_POSIX_TIMERS)) { /* * update_rlimit_cpu can fail if the task is exiting, but there * may be other tasks in the thread group that are not exiting, * and they need their cpu timers adjusted. * * The group_leader is the last task to be released, so if we * cannot update_rlimit_cpu on it, then the entire process is * exiting and we do not need to update at all. */ update_rlimit_cpu(tsk->group_leader, new_rlim->rlim_cur); } return retval; } SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { struct rlimit value; int ret; ret = do_prlimit(current, resource, NULL, &value); if (!ret) ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; return ret; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct compat_rlimit __user *, rlim) { struct rlimit r; struct compat_rlimit r32; if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit))) return -EFAULT; if (r32.rlim_cur == COMPAT_RLIM_INFINITY) r.rlim_cur = RLIM_INFINITY; else r.rlim_cur = r32.rlim_cur; if (r32.rlim_max == COMPAT_RLIM_INFINITY) r.rlim_max = RLIM_INFINITY; else r.rlim_max = r32.rlim_max; return do_prlimit(current, resource, &r, NULL); } COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct compat_rlimit __user *, rlim) { struct rlimit r; int ret; ret = do_prlimit(current, resource, NULL, &r); if (!ret) { struct compat_rlimit r32; if (r.rlim_cur > COMPAT_RLIM_INFINITY) r32.rlim_cur = COMPAT_RLIM_INFINITY; else r32.rlim_cur = r.rlim_cur; if (r.rlim_max > COMPAT_RLIM_INFINITY) r32.rlim_max = COMPAT_RLIM_INFINITY; else r32.rlim_max = r.rlim_max; if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit))) return -EFAULT; } return ret; } #endif #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT /* * Back compatibility for getrlimit. Needed for some apps. */ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { struct rlimit x; if (resource >= RLIM_NLIMITS) return -EINVAL; resource = array_index_nospec(resource, RLIM_NLIMITS); task_lock(current->group_leader); x = current->signal->rlim[resource]; task_unlock(current->group_leader); if (x.rlim_cur > 0x7FFFFFFF) x.rlim_cur = 0x7FFFFFFF; if (x.rlim_max > 0x7FFFFFFF) x.rlim_max = 0x7FFFFFFF; return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, struct compat_rlimit __user *, rlim) { struct rlimit r; if (resource >= RLIM_NLIMITS) return -EINVAL; resource = array_index_nospec(resource, RLIM_NLIMITS); task_lock(current->group_leader); r = current->signal->rlim[resource]; task_unlock(current->group_leader); if (r.rlim_cur > 0x7FFFFFFF) r.rlim_cur = 0x7FFFFFFF; if (r.rlim_max > 0x7FFFFFFF) r.rlim_max = 0x7FFFFFFF; if (put_user(r.rlim_cur, &rlim->rlim_cur) || put_user(r.rlim_max, &rlim->rlim_max)) return -EFAULT; return 0; } #endif #endif static inline bool rlim64_is_infinity(__u64 rlim64) { #if BITS_PER_LONG < 64 return rlim64 >= ULONG_MAX; #else return rlim64 == RLIM64_INFINITY; #endif } static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64) { if (rlim->rlim_cur == RLIM_INFINITY) rlim64->rlim_cur = RLIM64_INFINITY; else rlim64->rlim_cur = rlim->rlim_cur; if (rlim->rlim_max == RLIM_INFINITY) rlim64->rlim_max = RLIM64_INFINITY; else rlim64->rlim_max = rlim->rlim_max; } static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) { if (rlim64_is_infinity(rlim64->rlim_cur)) rlim->rlim_cur = RLIM_INFINITY; else rlim->rlim_cur = (unsigned long)rlim64->rlim_cur; if (rlim64_is_infinity(rlim64->rlim_max)) rlim->rlim_max = RLIM_INFINITY; else rlim->rlim_max = (unsigned long)rlim64->rlim_max; } /* rcu lock must be held */ static int check_prlimit_permission(struct task_struct *task, unsigned int flags) { const struct cred *cred = current_cred(), *tcred; bool id_match; if (current == task) return 0; tcred = __task_cred(task); id_match = (uid_eq(cred->uid, tcred->euid) && uid_eq(cred->uid, tcred->suid) && uid_eq(cred->uid, tcred->uid) && gid_eq(cred->gid, tcred->egid) && gid_eq(cred->gid, tcred->sgid) && gid_eq(cred->gid, tcred->gid)); if (!id_match && !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) return -EPERM; return security_task_prlimit(cred, tcred, flags); } SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, const struct rlimit64 __user *, new_rlim, struct rlimit64 __user *, old_rlim) { struct rlimit64 old64, new64; struct rlimit old, new; struct task_struct *tsk; unsigned int checkflags = 0; int ret; if (old_rlim) checkflags |= LSM_PRLIMIT_READ; if (new_rlim) { if (copy_from_user(&new64, new_rlim, sizeof(new64))) return -EFAULT; rlim64_to_rlim(&new64, &new); checkflags |= LSM_PRLIMIT_WRITE; } rcu_read_lock(); tsk = pid ? find_task_by_vpid(pid) : current; if (!tsk) { rcu_read_unlock(); return -ESRCH; } ret = check_prlimit_permission(tsk, checkflags); if (ret) { rcu_read_unlock(); return ret; } get_task_struct(tsk); rcu_read_unlock(); ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, old_rlim ? &old : NULL); if (!ret && old_rlim) { rlim_to_rlim64(&old, &old64); if (copy_to_user(old_rlim, &old64, sizeof(old64))) ret = -EFAULT; } put_task_struct(tsk); return ret; } SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) { struct rlimit new_rlim; if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) return -EFAULT; return do_prlimit(current, resource, &new_rlim, NULL); } /* * It would make sense to put struct rusage in the task_struct, * except that would make the task_struct be *really big*. After * task_struct gets moved into malloc'ed memory, it would * make sense to do this. It will make moving the rest of the information * a lot simpler! (Which we're not doing right now because we're not * measuring them yet). * * When sampling multiple threads for RUSAGE_SELF, under SMP we might have * races with threads incrementing their own counters. But since word * reads are atomic, we either get new values or old values and we don't * care which for the sums. We always take the siglock to protect reading * the c* fields from p->signal from races with exit.c updating those * fields when reaping, so a sample either gets all the additions of a * given child after it's reaped, or none so this sample is before reaping. * * Locking: * We need to take the siglock for CHILDEREN, SELF and BOTH * for the cases current multithreaded, non-current single threaded * non-current multithreaded. Thread traversal is now safe with * the siglock held. * Strictly speaking, we donot need to take the siglock if we are current and * single threaded, as no one else can take our signal_struct away, no one * else can reap the children to update signal->c* counters, and no one else * can race with the signal-> fields. If we do not take any lock, the * signal-> fields could be read out of order while another thread was just * exiting. So we should place a read memory barrier when we avoid the lock. * On the writer side, write memory barrier is implied in __exit_signal * as __exit_signal releases the siglock spinlock after updating the signal-> * fields. But we don't do this yet to keep things simple. * */ static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) { r->ru_nvcsw += t->nvcsw; r->ru_nivcsw += t->nivcsw; r->ru_minflt += t->min_flt; r->ru_majflt += t->maj_flt; r->ru_inblock += task_io_get_inblock(t); r->ru_oublock += task_io_get_oublock(t); } void getrusage(struct task_struct *p, int who, struct rusage *r) { struct task_struct *t; unsigned long flags; u64 tgutime, tgstime, utime, stime; unsigned long maxrss; struct mm_struct *mm; struct signal_struct *sig = p->signal; unsigned int seq = 0; retry: memset(r, 0, sizeof(*r)); utime = stime = 0; maxrss = 0; if (who == RUSAGE_THREAD) { task_cputime_adjusted(current, &utime, &stime); accumulate_thread_rusage(p, r); maxrss = sig->maxrss; goto out_thread; } flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); switch (who) { case RUSAGE_BOTH: case RUSAGE_CHILDREN: utime = sig->cutime; stime = sig->cstime; r->ru_nvcsw = sig->cnvcsw; r->ru_nivcsw = sig->cnivcsw; r->ru_minflt = sig->cmin_flt; r->ru_majflt = sig->cmaj_flt; r->ru_inblock = sig->cinblock; r->ru_oublock = sig->coublock; maxrss = sig->cmaxrss; if (who == RUSAGE_CHILDREN) break; fallthrough; case RUSAGE_SELF: r->ru_nvcsw += sig->nvcsw; r->ru_nivcsw += sig->nivcsw; r->ru_minflt += sig->min_flt; r->ru_majflt += sig->maj_flt; r->ru_inblock += sig->inblock; r->ru_oublock += sig->oublock; if (maxrss < sig->maxrss) maxrss = sig->maxrss; rcu_read_lock(); __for_each_thread(sig, t) accumulate_thread_rusage(t, r); rcu_read_unlock(); break; default: BUG(); } if (need_seqretry(&sig->stats_lock, seq)) { seq = 1; goto retry; } done_seqretry_irqrestore(&sig->stats_lock, seq, flags); if (who == RUSAGE_CHILDREN) goto out_children; thread_group_cputime_adjusted(p, &tgutime, &tgstime); utime += tgutime; stime += tgstime; out_thread: mm = get_task_mm(p); if (mm) { setmax_mm_hiwater_rss(&maxrss, mm); mmput(mm); } out_children: r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ r->ru_utime = ns_to_kernel_old_timeval(utime); r->ru_stime = ns_to_kernel_old_timeval(stime); } SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) { struct rusage r; if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && who != RUSAGE_THREAD) return -EINVAL; getrusage(current, who, &r); return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru) { struct rusage r; if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && who != RUSAGE_THREAD) return -EINVAL; getrusage(current, who, &r); return put_compat_rusage(&r, ru); } #endif SYSCALL_DEFINE1(umask, int, mask) { mask = xchg(&current->fs->umask, mask & S_IRWXUGO); return mask; } static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { CLASS(fd, exe)(fd); struct inode *inode; int err; if (fd_empty(exe)) return -EBADF; inode = file_inode(fd_file(exe)); /* * Because the original mm->exe_file points to executable file, make * sure that this one is executable as well, to avoid breaking an * overall picture. */ if (!S_ISREG(inode->i_mode) || path_noexec(&fd_file(exe)->f_path)) return -EACCES; err = file_permission(fd_file(exe), MAY_EXEC); if (err) return err; return replace_mm_exe_file(mm, fd_file(exe)); } /* * Check arithmetic relations of passed addresses. * * WARNING: we don't require any capability here so be very careful * in what is allowed for modification from userspace. */ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) { unsigned long mmap_max_addr = TASK_SIZE; int error = -EINVAL, i; static const unsigned char offsets[] = { offsetof(struct prctl_mm_map, start_code), offsetof(struct prctl_mm_map, end_code), offsetof(struct prctl_mm_map, start_data), offsetof(struct prctl_mm_map, end_data), offsetof(struct prctl_mm_map, start_brk), offsetof(struct prctl_mm_map, brk), offsetof(struct prctl_mm_map, start_stack), offsetof(struct prctl_mm_map, arg_start), offsetof(struct prctl_mm_map, arg_end), offsetof(struct prctl_mm_map, env_start), offsetof(struct prctl_mm_map, env_end), }; /* * Make sure the members are not somewhere outside * of allowed address space. */ for (i = 0; i < ARRAY_SIZE(offsets); i++) { u64 val = *(u64 *)((char *)prctl_map + offsets[i]); if ((unsigned long)val >= mmap_max_addr || (unsigned long)val < mmap_min_addr) goto out; } /* * Make sure the pairs are ordered. */ #define __prctl_check_order(__m1, __op, __m2) \ ((unsigned long)prctl_map->__m1 __op \ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL error = __prctl_check_order(start_code, <, end_code); error |= __prctl_check_order(start_data,<=, end_data); error |= __prctl_check_order(start_brk, <=, brk); error |= __prctl_check_order(arg_start, <=, arg_end); error |= __prctl_check_order(env_start, <=, env_end); if (error) goto out; #undef __prctl_check_order error = -EINVAL; /* * Neither we should allow to override limits if they set. */ if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, prctl_map->start_brk, prctl_map->end_data, prctl_map->start_data)) goto out; error = 0; out: return error; } #ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) { struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; unsigned long user_auxv[AT_VECTOR_SIZE]; struct mm_struct *mm = current->mm; int error; BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256); if (opt == PR_SET_MM_MAP_SIZE) return put_user((unsigned int)sizeof(prctl_map), (unsigned int __user *)addr); if (data_size != sizeof(prctl_map)) return -EINVAL; if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) return -EFAULT; error = validate_prctl_map_addr(&prctl_map); if (error) return error; if (prctl_map.auxv_size) { /* * Someone is trying to cheat the auxv vector. */ if (!prctl_map.auxv || prctl_map.auxv_size > sizeof(mm->saved_auxv)) return -EINVAL; memset(user_auxv, 0, sizeof(user_auxv)); if (copy_from_user(user_auxv, (const void __user *)prctl_map.auxv, prctl_map.auxv_size)) return -EFAULT; /* Last entry must be AT_NULL as specification requires */ user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL; user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; } if (prctl_map.exe_fd != (u32)-1) { /* * Check if the current user is checkpoint/restore capable. * At the time of this writing, it checks for CAP_SYS_ADMIN * or CAP_CHECKPOINT_RESTORE. * Note that a user with access to ptrace can masquerade an * arbitrary program as any executable, even setuid ones. * This may have implications in the tomoyo subsystem. */ if (!checkpoint_restore_ns_capable(current_user_ns())) return -EPERM; error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); if (error) return error; } /* * arg_lock protects concurrent updates but we still need mmap_lock for * read to exclude races with sys_brk. */ mmap_read_lock(mm); /* * We don't validate if these members are pointing to * real present VMAs because application may have correspond * VMAs already unmapped and kernel uses these members for statistics * output in procfs mostly, except * * - @start_brk/@brk which are used in do_brk_flags but kernel lookups * for VMAs when updating these members so anything wrong written * here cause kernel to swear at userspace program but won't lead * to any problem in kernel itself */ spin_lock(&mm->arg_lock); mm->start_code = prctl_map.start_code; mm->end_code = prctl_map.end_code; mm->start_data = prctl_map.start_data; mm->end_data = prctl_map.end_data; mm->start_brk = prctl_map.start_brk; mm->brk = prctl_map.brk; mm->start_stack = prctl_map.start_stack; mm->arg_start = prctl_map.arg_start; mm->arg_end = prctl_map.arg_end; mm->env_start = prctl_map.env_start; mm->env_end = prctl_map.env_end; spin_unlock(&mm->arg_lock); /* * Note this update of @saved_auxv is lockless thus * if someone reads this member in procfs while we're * updating -- it may get partly updated results. It's * known and acceptable trade off: we leave it as is to * not introduce additional locks here making the kernel * more complex. */ if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); mmap_read_unlock(mm); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr, unsigned long len) { /* * This doesn't move the auxiliary vector itself since it's pinned to * mm_struct, but it permits filling the vector with new values. It's * up to the caller to provide sane values here, otherwise userspace * tools which use this vector might be unhappy. */ unsigned long user_auxv[AT_VECTOR_SIZE] = {}; if (len > sizeof(user_auxv)) return -EINVAL; if (copy_from_user(user_auxv, (const void __user *)addr, len)) return -EFAULT; /* Make sure the last entry is always AT_NULL */ user_auxv[AT_VECTOR_SIZE - 2] = 0; user_auxv[AT_VECTOR_SIZE - 1] = 0; BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); task_lock(current); memcpy(mm->saved_auxv, user_auxv, len); task_unlock(current); return 0; } static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { struct mm_struct *mm = current->mm; struct prctl_mm_map prctl_map = { .auxv = NULL, .auxv_size = 0, .exe_fd = -1, }; struct vm_area_struct *vma; int error; if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && opt != PR_SET_MM_MAP && opt != PR_SET_MM_MAP_SIZE))) return -EINVAL; #ifdef CONFIG_CHECKPOINT_RESTORE if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE) return prctl_set_mm_map(opt, (const void __user *)addr, arg4); #endif if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (opt == PR_SET_MM_EXE_FILE) return prctl_set_mm_exe_file(mm, (unsigned int)addr); if (opt == PR_SET_MM_AUXV) return prctl_set_auxv(mm, addr, arg4); if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; error = -EINVAL; /* * arg_lock protects concurrent updates of arg boundaries, we need * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr * validation. */ mmap_read_lock(mm); vma = find_vma(mm, addr); spin_lock(&mm->arg_lock); prctl_map.start_code = mm->start_code; prctl_map.end_code = mm->end_code; prctl_map.start_data = mm->start_data; prctl_map.end_data = mm->end_data; prctl_map.start_brk = mm->start_brk; prctl_map.brk = mm->brk; prctl_map.start_stack = mm->start_stack; prctl_map.arg_start = mm->arg_start; prctl_map.arg_end = mm->arg_end; prctl_map.env_start = mm->env_start; prctl_map.env_end = mm->env_end; switch (opt) { case PR_SET_MM_START_CODE: prctl_map.start_code = addr; break; case PR_SET_MM_END_CODE: prctl_map.end_code = addr; break; case PR_SET_MM_START_DATA: prctl_map.start_data = addr; break; case PR_SET_MM_END_DATA: prctl_map.end_data = addr; break; case PR_SET_MM_START_STACK: prctl_map.start_stack = addr; break; case PR_SET_MM_START_BRK: prctl_map.start_brk = addr; break; case PR_SET_MM_BRK: prctl_map.brk = addr; break; case PR_SET_MM_ARG_START: prctl_map.arg_start = addr; break; case PR_SET_MM_ARG_END: prctl_map.arg_end = addr; break; case PR_SET_MM_ENV_START: prctl_map.env_start = addr; break; case PR_SET_MM_ENV_END: prctl_map.env_end = addr; break; default: goto out; } error = validate_prctl_map_addr(&prctl_map); if (error) goto out; switch (opt) { /* * If command line arguments and environment * are placed somewhere else on stack, we can * set them up here, ARG_START/END to setup * command line arguments and ENV_START/END * for environment. */ case PR_SET_MM_START_STACK: case PR_SET_MM_ARG_START: case PR_SET_MM_ARG_END: case PR_SET_MM_ENV_START: case PR_SET_MM_ENV_END: if (!vma) { error = -EFAULT; goto out; } } mm->start_code = prctl_map.start_code; mm->end_code = prctl_map.end_code; mm->start_data = prctl_map.start_data; mm->end_data = prctl_map.end_data; mm->start_brk = prctl_map.start_brk; mm->brk = prctl_map.brk; mm->start_stack = prctl_map.start_stack; mm->arg_start = prctl_map.arg_start; mm->arg_end = prctl_map.arg_end; mm->env_start = prctl_map.env_start; mm->env_end = prctl_map.env_end; error = 0; out: spin_unlock(&mm->arg_lock); mmap_read_unlock(mm); return error; } #ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr) { return put_user(me->clear_child_tid, tid_addr); } #else static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr) { return -EINVAL; } #endif static int propagate_has_child_subreaper(struct task_struct *p, void *data) { /* * If task has has_child_subreaper - all its descendants * already have these flag too and new descendants will * inherit it on fork, skip them. * * If we've found child_reaper - skip descendants in * it's subtree as they will never get out pidns. */ if (p->signal->has_child_subreaper || is_child_reaper(task_pid(p))) return 0; p->signal->has_child_subreaper = 1; return 1; } int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which) { return -EINVAL; } int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, unsigned long ctrl) { return -EINVAL; } int __weak arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status) { return -EINVAL; } int __weak arch_set_shadow_stack_status(struct task_struct *t, unsigned long status) { return -EINVAL; } int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status) { return -EINVAL; } #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) #ifdef CONFIG_ANON_VMA_NAME #define ANON_VMA_NAME_MAX_LEN 80 #define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" static inline bool is_valid_name_char(char ch) { /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ return ch > 0x1f && ch < 0x7f && !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); } static int prctl_set_vma(unsigned long opt, unsigned long addr, unsigned long size, unsigned long arg) { struct mm_struct *mm = current->mm; const char __user *uname; struct anon_vma_name *anon_name = NULL; int error; switch (opt) { case PR_SET_VMA_ANON_NAME: uname = (const char __user *)arg; if (uname) { char *name, *pch; name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); if (IS_ERR(name)) return PTR_ERR(name); for (pch = name; *pch != '\0'; pch++) { if (!is_valid_name_char(*pch)) { kfree(name); return -EINVAL; } } /* anon_vma has its own copy */ anon_name = anon_vma_name_alloc(name); kfree(name); if (!anon_name) return -ENOMEM; } mmap_write_lock(mm); error = madvise_set_anon_name(mm, addr, size, anon_name); mmap_write_unlock(mm); anon_vma_name_put(anon_name); break; default: error = -EINVAL; } return error; } #else /* CONFIG_ANON_VMA_NAME */ static int prctl_set_vma(unsigned long opt, unsigned long start, unsigned long size, unsigned long arg) { return -EINVAL; } #endif /* CONFIG_ANON_VMA_NAME */ static inline unsigned long get_current_mdwe(void) { unsigned long ret = 0; if (test_bit(MMF_HAS_MDWE, &current->mm->flags)) ret |= PR_MDWE_REFUSE_EXEC_GAIN; if (test_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags)) ret |= PR_MDWE_NO_INHERIT; return ret; } static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, unsigned long arg4, unsigned long arg5) { unsigned long current_bits; if (arg3 || arg4 || arg5) return -EINVAL; if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT)) return -EINVAL; /* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */ if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN)) return -EINVAL; /* * EOPNOTSUPP might be more appropriate here in principle, but * existing userspace depends on EINVAL specifically. */ if (!arch_memory_deny_write_exec_supported()) return -EINVAL; current_bits = get_current_mdwe(); if (current_bits && current_bits != bits) return -EPERM; /* Cannot unset the flags */ if (bits & PR_MDWE_NO_INHERIT) set_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags); if (bits & PR_MDWE_REFUSE_EXEC_GAIN) set_bit(MMF_HAS_MDWE, &current->mm->flags); return 0; } static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { if (arg2 || arg3 || arg4 || arg5) return -EINVAL; return get_current_mdwe(); } static int prctl_get_auxv(void __user *addr, unsigned long len) { struct mm_struct *mm = current->mm; unsigned long size = min_t(unsigned long, sizeof(mm->saved_auxv), len); if (size && copy_to_user(addr, mm->saved_auxv, size)) return -EFAULT; return sizeof(mm->saved_auxv); } SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { struct task_struct *me = current; unsigned char comm[sizeof(me->comm)]; long error; error = security_task_prctl(option, arg2, arg3, arg4, arg5); if (error != -ENOSYS) return error; error = 0; switch (option) { case PR_SET_PDEATHSIG: if (!valid_signal(arg2)) { error = -EINVAL; break; } me->pdeath_signal = arg2; break; case PR_GET_PDEATHSIG: error = put_user(me->pdeath_signal, (int __user *)arg2); break; case PR_GET_DUMPABLE: error = get_dumpable(me->mm); break; case PR_SET_DUMPABLE: if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) { error = -EINVAL; break; } set_dumpable(me->mm, arg2); break; case PR_SET_UNALIGN: error = SET_UNALIGN_CTL(me, arg2); break; case PR_GET_UNALIGN: error = GET_UNALIGN_CTL(me, arg2); break; case PR_SET_FPEMU: error = SET_FPEMU_CTL(me, arg2); break; case PR_GET_FPEMU: error = GET_FPEMU_CTL(me, arg2); break; case PR_SET_FPEXC: error = SET_FPEXC_CTL(me, arg2); break; case PR_GET_FPEXC: error = GET_FPEXC_CTL(me, arg2); break; case PR_GET_TIMING: error = PR_TIMING_STATISTICAL; break; case PR_SET_TIMING: if (arg2 != PR_TIMING_STATISTICAL) error = -EINVAL; break; case PR_SET_NAME: comm[sizeof(me->comm) - 1] = 0; if (strncpy_from_user(comm, (char __user *)arg2, sizeof(me->comm) - 1) < 0) return -EFAULT; set_task_comm(me, comm); proc_comm_connector(me); break; case PR_GET_NAME: get_task_comm(comm, me); if (copy_to_user((char __user *)arg2, comm, sizeof(comm))) return -EFAULT; break; case PR_GET_ENDIAN: error = GET_ENDIAN(me, arg2); break; case PR_SET_ENDIAN: error = SET_ENDIAN(me, arg2); break; case PR_GET_SECCOMP: error = prctl_get_seccomp(); break; case PR_SET_SECCOMP: error = prctl_set_seccomp(arg2, (char __user *)arg3); break; case PR_GET_TSC: error = GET_TSC_CTL(arg2); break; case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; case PR_TASK_PERF_EVENTS_DISABLE: error = perf_event_task_disable(); break; case PR_TASK_PERF_EVENTS_ENABLE: error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: if (current->timer_slack_ns > ULONG_MAX) error = ULONG_MAX; else error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: if (rt_or_dl_task_policy(current)) break; if (arg2 <= 0) current->timer_slack_ns = current->default_timer_slack_ns; else current->timer_slack_ns = arg2; break; case PR_MCE_KILL: if (arg4 | arg5) return -EINVAL; switch (arg2) { case PR_MCE_KILL_CLEAR: if (arg3 != 0) return -EINVAL; current->flags &= ~PF_MCE_PROCESS; break; case PR_MCE_KILL_SET: current->flags |= PF_MCE_PROCESS; if (arg3 == PR_MCE_KILL_EARLY) current->flags |= PF_MCE_EARLY; else if (arg3 == PR_MCE_KILL_LATE) current->flags &= ~PF_MCE_EARLY; else if (arg3 == PR_MCE_KILL_DEFAULT) current->flags &= ~(PF_MCE_EARLY|PF_MCE_PROCESS); else return -EINVAL; break; default: return -EINVAL; } break; case PR_MCE_KILL_GET: if (arg2 | arg3 | arg4 | arg5) return -EINVAL; if (current->flags & PF_MCE_PROCESS) error = (current->flags & PF_MCE_EARLY) ? PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; else error = PR_MCE_KILL_DEFAULT; break; case PR_SET_MM: error = prctl_set_mm(arg2, arg3, arg4, arg5); break; case PR_GET_TID_ADDRESS: error = prctl_get_tid_address(me, (int __user * __user *)arg2); break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; if (!arg2) break; walk_process_tree(me, propagate_has_child_subreaper, NULL); break; case PR_GET_CHILD_SUBREAPER: error = put_user(me->signal->is_child_subreaper, (int __user *)arg2); break; case PR_SET_NO_NEW_PRIVS: if (arg2 != 1 || arg3 || arg4 || arg5) return -EINVAL; task_set_no_new_privs(current); break; case PR_GET_NO_NEW_PRIVS: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; return task_no_new_privs(current) ? 1 : 0; case PR_GET_THP_DISABLE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags); break; case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) return -EINVAL; if (mmap_write_lock_killable(me->mm)) return -EINTR; if (arg2) set_bit(MMF_DISABLE_THP, &me->mm->flags); else clear_bit(MMF_DISABLE_THP, &me->mm->flags); mmap_write_unlock(me->mm); break; case PR_MPX_ENABLE_MANAGEMENT: case PR_MPX_DISABLE_MANAGEMENT: /* No longer implemented: */ return -EINVAL; case PR_SET_FP_MODE: error = SET_FP_MODE(me, arg2); break; case PR_GET_FP_MODE: error = GET_FP_MODE(me); break; case PR_SVE_SET_VL: error = SVE_SET_VL(arg2); break; case PR_SVE_GET_VL: error = SVE_GET_VL(); break; case PR_SME_SET_VL: error = SME_SET_VL(arg2); break; case PR_SME_GET_VL: error = SME_GET_VL(); break; case PR_GET_SPECULATION_CTRL: if (arg3 || arg4 || arg5) return -EINVAL; error = arch_prctl_spec_ctrl_get(me, arg2); break; case PR_SET_SPECULATION_CTRL: if (arg4 || arg5) return -EINVAL; error = arch_prctl_spec_ctrl_set(me, arg2, arg3); break; case PR_PAC_RESET_KEYS: if (arg3 || arg4 || arg5) return -EINVAL; error = PAC_RESET_KEYS(me, arg2); break; case PR_PAC_SET_ENABLED_KEYS: if (arg4 || arg5) return -EINVAL; error = PAC_SET_ENABLED_KEYS(me, arg2, arg3); break; case PR_PAC_GET_ENABLED_KEYS: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; error = PAC_GET_ENABLED_KEYS(me); break; case PR_SET_TAGGED_ADDR_CTRL: if (arg3 || arg4 || arg5) return -EINVAL; error = SET_TAGGED_ADDR_CTRL(arg2); break; case PR_GET_TAGGED_ADDR_CTRL: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; error = GET_TAGGED_ADDR_CTRL(); break; case PR_SET_IO_FLUSHER: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (arg3 || arg4 || arg5) return -EINVAL; if (arg2 == 1) current->flags |= PR_IO_FLUSHER; else if (!arg2) current->flags &= ~PR_IO_FLUSHER; else return -EINVAL; break; case PR_GET_IO_FLUSHER: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (arg2 || arg3 || arg4 || arg5) return -EINVAL; error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER; break; case PR_SET_SYSCALL_USER_DISPATCH: error = set_syscall_user_dispatch(arg2, arg3, arg4, (char __user *) arg5); break; #ifdef CONFIG_SCHED_CORE case PR_SCHED_CORE: error = sched_core_share_pid(arg2, arg3, arg4, arg5); break; #endif case PR_SET_MDWE: error = prctl_set_mdwe(arg2, arg3, arg4, arg5); break; case PR_GET_MDWE: error = prctl_get_mdwe(arg2, arg3, arg4, arg5); break; case PR_PPC_GET_DEXCR: if (arg3 || arg4 || arg5) return -EINVAL; error = PPC_GET_DEXCR_ASPECT(me, arg2); break; case PR_PPC_SET_DEXCR: if (arg4 || arg5) return -EINVAL; error = PPC_SET_DEXCR_ASPECT(me, arg2, arg3); break; case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; case PR_GET_AUXV: if (arg4 || arg5) return -EINVAL; error = prctl_get_auxv((void __user *)arg2, arg3); break; #ifdef CONFIG_KSM case PR_SET_MEMORY_MERGE: if (arg3 || arg4 || arg5) return -EINVAL; if (mmap_write_lock_killable(me->mm)) return -EINTR; if (arg2) error = ksm_enable_merge_any(me->mm); else error = ksm_disable_merge_any(me->mm); mmap_write_unlock(me->mm); break; case PR_GET_MEMORY_MERGE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags); break; #endif case PR_RISCV_V_SET_CONTROL: error = RISCV_V_SET_CONTROL(arg2); break; case PR_RISCV_V_GET_CONTROL: error = RISCV_V_GET_CONTROL(); break; case PR_RISCV_SET_ICACHE_FLUSH_CTX: error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); break; case PR_GET_SHADOW_STACK_STATUS: if (arg3 || arg4 || arg5) return -EINVAL; error = arch_get_shadow_stack_status(me, (unsigned long __user *) arg2); break; case PR_SET_SHADOW_STACK_STATUS: if (arg3 || arg4 || arg5) return -EINVAL; error = arch_set_shadow_stack_status(me, arg2); break; case PR_LOCK_SHADOW_STACK_STATUS: if (arg3 || arg4 || arg5) return -EINVAL; error = arch_lock_shadow_stack_status(me, arg2); break; default: error = -EINVAL; break; } return error; } SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, struct getcpu_cache __user *, unused) { int err = 0; int cpu = raw_smp_processor_id(); if (cpup) err |= put_user(cpu, cpup); if (nodep) err |= put_user(cpu_to_node(cpu), nodep); return err ? -EFAULT : 0; } /** * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill */ static int do_sysinfo(struct sysinfo *info) { unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; struct timespec64 tp; memset(info, 0, sizeof(struct sysinfo)); ktime_get_boottime_ts64(&tp); timens_add_boottime(&tp); info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); info->procs = nr_threads; si_meminfo(info); si_swapinfo(info); /* * If the sum of all the available memory (i.e. ram + swap) * is less than can be stored in a 32 bit unsigned long then * we can be binary compatible with 2.2.x kernels. If not, * well, in that case 2.2.x was broken anyways... * * -Erik Andersen <andersee@debian.org> */ mem_total = info->totalram + info->totalswap; if (mem_total < info->totalram || mem_total < info->totalswap) goto out; bitcount = 0; mem_unit = info->mem_unit; while (mem_unit > 1) { bitcount++; mem_unit >>= 1; sav_total = mem_total; mem_total <<= 1; if (mem_total < sav_total) goto out; } /* * If mem_total did not overflow, multiply all memory values by * info->mem_unit and set it to 1. This leaves things compatible * with 2.2.x, and also retains compatibility with earlier 2.4.x * kernels... */ info->mem_unit = 1; info->totalram <<= bitcount; info->freeram <<= bitcount; info->sharedram <<= bitcount; info->bufferram <<= bitcount; info->totalswap <<= bitcount; info->freeswap <<= bitcount; info->totalhigh <<= bitcount; info->freehigh <<= bitcount; out: return 0; } SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) { struct sysinfo val; do_sysinfo(&val); if (copy_to_user(info, &val, sizeof(struct sysinfo))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT struct compat_sysinfo { s32 uptime; u32 loads[3]; u32 totalram; u32 freeram; u32 sharedram; u32 bufferram; u32 totalswap; u32 freeswap; u16 procs; u16 pad; u32 totalhigh; u32 freehigh; u32 mem_unit; char _f[20-2*sizeof(u32)-sizeof(int)]; }; COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) { struct sysinfo s; struct compat_sysinfo s_32; do_sysinfo(&s); /* Check to see if any memory value is too large for 32-bit and scale * down if needed */ if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) { int bitcount = 0; while (s.mem_unit < PAGE_SIZE) { s.mem_unit <<= 1; bitcount++; } s.totalram >>= bitcount; s.freeram >>= bitcount; s.sharedram >>= bitcount; s.bufferram >>= bitcount; s.totalswap >>= bitcount; s.freeswap >>= bitcount; s.totalhigh >>= bitcount; s.freehigh >>= bitcount; } memset(&s_32, 0, sizeof(s_32)); s_32.uptime = s.uptime; s_32.loads[0] = s.loads[0]; s_32.loads[1] = s.loads[1]; s_32.loads[2] = s.loads[2]; s_32.totalram = s.totalram; s_32.freeram = s.freeram; s_32.sharedram = s.sharedram; s_32.bufferram = s.bufferram; s_32.totalswap = s.totalswap; s_32.freeswap = s.freeswap; s_32.procs = s.procs; s_32.totalhigh = s.totalhigh; s_32.freehigh = s.freehigh; s_32.mem_unit = s.mem_unit; if (copy_to_user(info, &s_32, sizeof(s_32))) return -EFAULT; return 0; } #endif /* CONFIG_COMPAT */
10994 1682 10990 3895 10172 3895 10623 8221 10621 8214 8219 8218 8238 8241 10636 10621 10630 10662 12 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 // SPDX-License-Identifier: GPL-2.0 #include <linux/debugfs.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/memblock.h> #include <linux/stacktrace.h> #include <linux/page_owner.h> #include <linux/jump_label.h> #include <linux/migrate.h> #include <linux/stackdepot.h> #include <linux/seq_file.h> #include <linux/memcontrol.h> #include <linux/sched/clock.h> #include "internal.h" /* * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack) * to use off stack temporal storage */ #define PAGE_OWNER_STACK_DEPTH (16) struct page_owner { unsigned short order; short last_migrate_reason; gfp_t gfp_mask; depot_stack_handle_t handle; depot_stack_handle_t free_handle; u64 ts_nsec; u64 free_ts_nsec; char comm[TASK_COMM_LEN]; pid_t pid; pid_t tgid; pid_t free_pid; pid_t free_tgid; }; struct stack { struct stack_record *stack_record; struct stack *next; }; static struct stack dummy_stack; static struct stack failure_stack; static struct stack *stack_list; static DEFINE_SPINLOCK(stack_list_lock); static bool page_owner_enabled __initdata; DEFINE_STATIC_KEY_FALSE(page_owner_inited); static depot_stack_handle_t dummy_handle; static depot_stack_handle_t failure_handle; static depot_stack_handle_t early_handle; static void init_early_allocated_pages(void); static inline void set_current_in_page_owner(void) { /* * Avoid recursion. * * We might need to allocate more memory from page_owner code, so make * sure to signal it in order to avoid recursion. */ current->in_page_owner = 1; } static inline void unset_current_in_page_owner(void) { current->in_page_owner = 0; } static int __init early_page_owner_param(char *buf) { int ret = kstrtobool(buf, &page_owner_enabled); if (page_owner_enabled) stack_depot_request_early_init(); return ret; } early_param("page_owner", early_page_owner_param); static __init bool need_page_owner(void) { return page_owner_enabled; } static __always_inline depot_stack_handle_t create_dummy_stack(void) { unsigned long entries[4]; unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); return stack_depot_save(entries, nr_entries, GFP_KERNEL); } static noinline void register_dummy_stack(void) { dummy_handle = create_dummy_stack(); } static noinline void register_failure_stack(void) { failure_handle = create_dummy_stack(); } static noinline void register_early_stack(void) { early_handle = create_dummy_stack(); } static __init void init_page_owner(void) { if (!page_owner_enabled) return; register_dummy_stack(); register_failure_stack(); register_early_stack(); init_early_allocated_pages(); /* Initialize dummy and failure stacks and link them to stack_list */ dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle); failure_stack.stack_record = __stack_depot_get_stack_record(failure_handle); if (dummy_stack.stack_record) refcount_set(&dummy_stack.stack_record->count, 1); if (failure_stack.stack_record) refcount_set(&failure_stack.stack_record->count, 1); dummy_stack.next = &failure_stack; stack_list = &dummy_stack; static_branch_enable(&page_owner_inited); } struct page_ext_operations page_owner_ops = { .size = sizeof(struct page_owner), .need = need_page_owner, .init = init_page_owner, .need_shared_flags = true, }; static inline struct page_owner *get_page_owner(struct page_ext *page_ext) { return page_ext_data(page_ext, &page_owner_ops); } static noinline depot_stack_handle_t save_stack(gfp_t flags) { unsigned long entries[PAGE_OWNER_STACK_DEPTH]; depot_stack_handle_t handle; unsigned int nr_entries; if (current->in_page_owner) return dummy_handle; set_current_in_page_owner(); nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); handle = stack_depot_save(entries, nr_entries, flags); if (!handle) handle = failure_handle; unset_current_in_page_owner(); return handle; } static void add_stack_record_to_list(struct stack_record *stack_record, gfp_t gfp_mask) { unsigned long flags; struct stack *stack; set_current_in_page_owner(); stack = kmalloc(sizeof(*stack), gfp_nested_mask(gfp_mask)); if (!stack) { unset_current_in_page_owner(); return; } unset_current_in_page_owner(); stack->stack_record = stack_record; stack->next = NULL; spin_lock_irqsave(&stack_list_lock, flags); stack->next = stack_list; /* * This pairs with smp_load_acquire() from function * stack_start(). This guarantees that stack_start() * will see an updated stack_list before starting to * traverse the list. */ smp_store_release(&stack_list, stack); spin_unlock_irqrestore(&stack_list_lock, flags); } static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask, int nr_base_pages) { struct stack_record *stack_record = __stack_depot_get_stack_record(handle); if (!stack_record) return; /* * New stack_record's that do not use STACK_DEPOT_FLAG_GET start * with REFCOUNT_SATURATED to catch spurious increments of their * refcount. * Since we do not use STACK_DEPOT_FLAG_GET API, let us * set a refcount of 1 ourselves. */ if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) { int old = REFCOUNT_SATURATED; if (atomic_try_cmpxchg_relaxed(&stack_record->count.refs, &old, 1)) /* Add the new stack_record to our list */ add_stack_record_to_list(stack_record, gfp_mask); } refcount_add(nr_base_pages, &stack_record->count); } static void dec_stack_record_count(depot_stack_handle_t handle, int nr_base_pages) { struct stack_record *stack_record = __stack_depot_get_stack_record(handle); if (!stack_record) return; if (refcount_sub_and_test(nr_base_pages, &stack_record->count)) pr_warn("%s: refcount went to 0 for %u handle\n", __func__, handle); } static inline void __update_page_owner_handle(struct page_ext *page_ext, depot_stack_handle_t handle, unsigned short order, gfp_t gfp_mask, short last_migrate_reason, u64 ts_nsec, pid_t pid, pid_t tgid, char *comm) { int i; struct page_owner *page_owner; for (i = 0; i < (1 << order); i++) { page_owner = get_page_owner(page_ext); page_owner->handle = handle; page_owner->order = order; page_owner->gfp_mask = gfp_mask; page_owner->last_migrate_reason = last_migrate_reason; page_owner->pid = pid; page_owner->tgid = tgid; page_owner->ts_nsec = ts_nsec; strscpy(page_owner->comm, comm, sizeof(page_owner->comm)); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); page_ext = page_ext_next(page_ext); } } static inline void __update_page_owner_free_handle(struct page_ext *page_ext, depot_stack_handle_t handle, unsigned short order, pid_t pid, pid_t tgid, u64 free_ts_nsec) { int i; struct page_owner *page_owner; for (i = 0; i < (1 << order); i++) { page_owner = get_page_owner(page_ext); /* Only __reset_page_owner() wants to clear the bit */ if (handle) { __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); page_owner->free_handle = handle; } page_owner->free_ts_nsec = free_ts_nsec; page_owner->free_pid = current->pid; page_owner->free_tgid = current->tgid; page_ext = page_ext_next(page_ext); } } void __reset_page_owner(struct page *page, unsigned short order) { struct page_ext *page_ext; depot_stack_handle_t handle; depot_stack_handle_t alloc_handle; struct page_owner *page_owner; u64 free_ts_nsec = local_clock(); page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; page_owner = get_page_owner(page_ext); alloc_handle = page_owner->handle; handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); __update_page_owner_free_handle(page_ext, handle, order, current->pid, current->tgid, free_ts_nsec); page_ext_put(page_ext); if (alloc_handle != early_handle) /* * early_handle is being set as a handle for all those * early allocated pages. See init_pages_in_zone(). * Since their refcount is not being incremented because * the machinery is not ready yet, we cannot decrement * their refcount either. */ dec_stack_record_count(alloc_handle, 1 << order); } noinline void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask) { struct page_ext *page_ext; u64 ts_nsec = local_clock(); depot_stack_handle_t handle; handle = save_stack(gfp_mask); page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; __update_page_owner_handle(page_ext, handle, order, gfp_mask, -1, ts_nsec, current->pid, current->tgid, current->comm); page_ext_put(page_ext); inc_stack_record_count(handle, gfp_mask, 1 << order); } void __set_page_owner_migrate_reason(struct page *page, int reason) { struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; if (unlikely(!page_ext)) return; page_owner = get_page_owner(page_ext); page_owner->last_migrate_reason = reason; page_ext_put(page_ext); } void __split_page_owner(struct page *page, int old_order, int new_order) { int i; struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; if (unlikely(!page_ext)) return; for (i = 0; i < (1 << old_order); i++) { page_owner = get_page_owner(page_ext); page_owner->order = new_order; page_ext = page_ext_next(page_ext); } page_ext_put(page_ext); } void __folio_copy_owner(struct folio *newfolio, struct folio *old) { int i; struct page_ext *old_ext; struct page_ext *new_ext; struct page_owner *old_page_owner; struct page_owner *new_page_owner; depot_stack_handle_t migrate_handle; old_ext = page_ext_get(&old->page); if (unlikely(!old_ext)) return; new_ext = page_ext_get(&newfolio->page); if (unlikely(!new_ext)) { page_ext_put(old_ext); return; } old_page_owner = get_page_owner(old_ext); new_page_owner = get_page_owner(new_ext); migrate_handle = new_page_owner->handle; __update_page_owner_handle(new_ext, old_page_owner->handle, old_page_owner->order, old_page_owner->gfp_mask, old_page_owner->last_migrate_reason, old_page_owner->ts_nsec, old_page_owner->pid, old_page_owner->tgid, old_page_owner->comm); /* * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio * will be freed after migration. Keep them until then as they may be * useful. */ __update_page_owner_free_handle(new_ext, 0, old_page_owner->order, old_page_owner->free_pid, old_page_owner->free_tgid, old_page_owner->free_ts_nsec); /* * We linked the original stack to the new folio, we need to do the same * for the new one and the old folio otherwise there will be an imbalance * when subtracting those pages from the stack. */ for (i = 0; i < (1 << new_page_owner->order); i++) { old_page_owner->handle = migrate_handle; old_ext = page_ext_next(old_ext); old_page_owner = get_page_owner(old_ext); } page_ext_put(new_ext); page_ext_put(old_ext); } void pagetypeinfo_showmixedcount_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { struct page *page; struct page_ext *page_ext; struct page_owner *page_owner; unsigned long pfn, block_end_pfn; unsigned long end_pfn = zone_end_pfn(zone); unsigned long count[MIGRATE_TYPES] = { 0, }; int pageblock_mt, page_mt; int i; /* Scan block by block. First and last block may be incomplete */ pfn = zone->zone_start_pfn; /* * Walk the zone in pageblock_nr_pages steps. If a page block spans * a zone boundary, it will be double counted between zones. This does * not matter as the mixed block count will still be correct */ for (; pfn < end_pfn; ) { page = pfn_to_online_page(pfn); if (!page) { pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); continue; } block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); pageblock_mt = get_pageblock_migratetype(page); for (; pfn < block_end_pfn; pfn++) { /* The pageblock is online, no need to recheck. */ page = pfn_to_page(pfn); if (page_zone(page) != zone) continue; if (PageBuddy(page)) { unsigned long freepage_order; freepage_order = buddy_order_unsafe(page); if (freepage_order <= MAX_PAGE_ORDER) pfn += (1UL << freepage_order) - 1; continue; } if (PageReserved(page)) continue; page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) goto ext_put_continue; page_owner = get_page_owner(page_ext); page_mt = gfp_migratetype(page_owner->gfp_mask); if (pageblock_mt != page_mt) { if (is_migrate_cma(pageblock_mt)) count[MIGRATE_MOVABLE]++; else count[pageblock_mt]++; pfn = block_end_pfn; page_ext_put(page_ext); break; } pfn += (1UL << page_owner->order) - 1; ext_put_continue: page_ext_put(page_ext); } } /* Print counts */ seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); for (i = 0; i < MIGRATE_TYPES; i++) seq_printf(m, "%12lu ", count[i]); seq_putc(m, '\n'); } /* * Looking for memcg information and print it out */ static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret, struct page *page) { #ifdef CONFIG_MEMCG unsigned long memcg_data; struct mem_cgroup *memcg; bool online; char name[80]; rcu_read_lock(); memcg_data = READ_ONCE(page->memcg_data); if (!memcg_data) goto out_unlock; if (memcg_data & MEMCG_DATA_OBJEXTS) ret += scnprintf(kbuf + ret, count - ret, "Slab cache page\n"); memcg = page_memcg_check(page); if (!memcg) goto out_unlock; online = (memcg->css.flags & CSS_ONLINE); cgroup_name(memcg->css.cgroup, name, sizeof(name)); ret += scnprintf(kbuf + ret, count - ret, "Charged %sto %smemcg %s\n", PageMemcgKmem(page) ? "(via objcg) " : "", online ? "" : "offline ", name); out_unlock: rcu_read_unlock(); #endif /* CONFIG_MEMCG */ return ret; } static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_owner *page_owner, depot_stack_handle_t handle) { int ret, pageblock_mt, page_mt; char *kbuf; count = min_t(size_t, count, PAGE_SIZE); kbuf = kmalloc(count, GFP_KERNEL); if (!kbuf) return -ENOMEM; ret = scnprintf(kbuf, count, "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, page_owner->tgid, page_owner->comm, page_owner->ts_nsec); /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); page_mt = gfp_migratetype(page_owner->gfp_mask); ret += scnprintf(kbuf + ret, count - ret, "PFN 0x%lx type %s Block %lu type %s Flags %pGp\n", pfn, migratetype_names[page_mt], pfn >> pageblock_order, migratetype_names[pageblock_mt], &page->flags); ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0); if (ret >= count) goto err; if (page_owner->last_migrate_reason != -1) { ret += scnprintf(kbuf + ret, count - ret, "Page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); } ret = print_page_owner_memcg(kbuf, count, ret, page); ret += snprintf(kbuf + ret, count - ret, "\n"); if (ret >= count) goto err; if (copy_to_user(buf, kbuf, ret)) ret = -EFAULT; kfree(kbuf); return ret; err: kfree(kbuf); return -ENOMEM; } void __dump_page_owner(const struct page *page) { struct page_ext *page_ext = page_ext_get((void *)page); struct page_owner *page_owner; depot_stack_handle_t handle; gfp_t gfp_mask; int mt; if (unlikely(!page_ext)) { pr_alert("There is not page extension available.\n"); return; } page_owner = get_page_owner(page_ext); gfp_mask = page_owner->gfp_mask; mt = gfp_migratetype(gfp_mask); if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not present (never set?)\n"); page_ext_put(page_ext); return; } if (test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) pr_alert("page_owner tracks the page as allocated\n"); else pr_alert("page_owner tracks the page as freed\n"); pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n", page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, page_owner->pid, page_owner->tgid, page_owner->comm, page_owner->ts_nsec, page_owner->free_ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) pr_alert("page_owner allocation stack trace missing\n"); else stack_depot_print(handle); handle = READ_ONCE(page_owner->free_handle); if (!handle) { pr_alert("page_owner free stack trace missing\n"); } else { pr_alert("page last free pid %d tgid %d stack trace:\n", page_owner->free_pid, page_owner->free_tgid); stack_depot_print(handle); } if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); page_ext_put(page_ext); } static ssize_t read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) { unsigned long pfn; struct page *page; struct page_ext *page_ext; struct page_owner *page_owner; depot_stack_handle_t handle; if (!static_branch_unlikely(&page_owner_inited)) return -EINVAL; page = NULL; if (*ppos == 0) pfn = min_low_pfn; else pfn = *ppos; /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */ while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) pfn++; /* Find an allocated page */ for (; pfn < max_pfn; pfn++) { /* * This temporary page_owner is required so * that we can avoid the context switches while holding * the rcu lock and copying the page owner information to * user through copy_to_user() or GFP_KERNEL allocations. */ struct page_owner page_owner_tmp; /* * If the new page is in a new MAX_ORDER_NR_PAGES area, * validate the area as existing, skip it if not */ if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) { pfn += MAX_ORDER_NR_PAGES - 1; continue; } page = pfn_to_page(pfn); if (PageBuddy(page)) { unsigned long freepage_order = buddy_order_unsafe(page); if (freepage_order <= MAX_PAGE_ORDER) pfn += (1UL << freepage_order) - 1; continue; } page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; /* * Some pages could be missed by concurrent allocation or free, * because we don't hold the zone lock. */ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) goto ext_put_continue; /* * Although we do have the info about past allocation of free * pages, it's not relevant for current memory usage. */ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) goto ext_put_continue; page_owner = get_page_owner(page_ext); /* * Don't print "tail" pages of high-order allocations as that * would inflate the stats. */ if (!IS_ALIGNED(pfn, 1 << page_owner->order)) goto ext_put_continue; /* * Access to page_ext->handle isn't synchronous so we should * be careful to access it. */ handle = READ_ONCE(page_owner->handle); if (!handle) goto ext_put_continue; /* Record the next PFN to read in the file offset */ *ppos = pfn + 1; page_owner_tmp = *page_owner; page_ext_put(page_ext); return print_page_owner(buf, count, pfn, page, &page_owner_tmp, handle); ext_put_continue: page_ext_put(page_ext); } return 0; } static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig) { switch (orig) { case SEEK_SET: file->f_pos = offset; break; case SEEK_CUR: file->f_pos += offset; break; default: return -EINVAL; } return file->f_pos; } static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) { unsigned long pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); unsigned long count = 0; /* * Walk the zone in pageblock_nr_pages steps. If a page block spans * a zone boundary, it will be double counted between zones. This does * not matter as the mixed block count will still be correct */ for (; pfn < end_pfn; ) { unsigned long block_end_pfn; if (!pfn_valid(pfn)) { pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); continue; } block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); for (; pfn < block_end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); struct page_ext *page_ext; if (page_zone(page) != zone) continue; /* * To avoid having to grab zone->lock, be a little * careful when reading buddy page order. The only * danger is that we skip too much and potentially miss * some early allocated pages, which is better than * heavy lock contention. */ if (PageBuddy(page)) { unsigned long order = buddy_order_unsafe(page); if (order > 0 && order <= MAX_PAGE_ORDER) pfn += (1UL << order) - 1; continue; } if (PageReserved(page)) continue; page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; /* Maybe overlapping zone */ if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) goto ext_put_continue; /* Found early allocated page */ __update_page_owner_handle(page_ext, early_handle, 0, 0, -1, local_clock(), current->pid, current->tgid, current->comm); count++; ext_put_continue: page_ext_put(page_ext); } cond_resched(); } pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", pgdat->node_id, zone->name, count); } static void init_zones_in_node(pg_data_t *pgdat) { struct zone *zone; struct zone *node_zones = pgdat->node_zones; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { if (!populated_zone(zone)) continue; init_pages_in_zone(pgdat, zone); } } static void init_early_allocated_pages(void) { pg_data_t *pgdat; for_each_online_pgdat(pgdat) init_zones_in_node(pgdat); } static const struct file_operations proc_page_owner_operations = { .read = read_page_owner, .llseek = lseek_page_owner, }; static void *stack_start(struct seq_file *m, loff_t *ppos) { struct stack *stack; if (*ppos == -1UL) return NULL; if (!*ppos) { /* * This pairs with smp_store_release() from function * add_stack_record_to_list(), so we get a consistent * value of stack_list. */ stack = smp_load_acquire(&stack_list); m->private = stack; } else { stack = m->private; } return stack; } static void *stack_next(struct seq_file *m, void *v, loff_t *ppos) { struct stack *stack = v; stack = stack->next; *ppos = stack ? *ppos + 1 : -1UL; m->private = stack; return stack; } static unsigned long page_owner_pages_threshold; static int stack_print(struct seq_file *m, void *v) { int i, nr_base_pages; struct stack *stack = v; unsigned long *entries; unsigned long nr_entries; struct stack_record *stack_record = stack->stack_record; if (!stack->stack_record) return 0; nr_entries = stack_record->size; entries = stack_record->entries; nr_base_pages = refcount_read(&stack_record->count) - 1; if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold) return 0; for (i = 0; i < nr_entries; i++) seq_printf(m, " %pS\n", (void *)entries[i]); seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages); return 0; } static void stack_stop(struct seq_file *m, void *v) { } static const struct seq_operations page_owner_stack_op = { .start = stack_start, .next = stack_next, .stop = stack_stop, .show = stack_print }; static int page_owner_stack_open(struct inode *inode, struct file *file) { return seq_open_private(file, &page_owner_stack_op, 0); } static const struct file_operations page_owner_stack_operations = { .open = page_owner_stack_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static int page_owner_threshold_get(void *data, u64 *val) { *val = READ_ONCE(page_owner_pages_threshold); return 0; } static int page_owner_threshold_set(void *data, u64 val) { WRITE_ONCE(page_owner_pages_threshold, val); return 0; } DEFINE_SIMPLE_ATTRIBUTE(proc_page_owner_threshold, &page_owner_threshold_get, &page_owner_threshold_set, "%llu"); static int __init pageowner_init(void) { struct dentry *dir; if (!static_branch_unlikely(&page_owner_inited)) { pr_info("page_owner is disabled\n"); return 0; } debugfs_create_file("page_owner", 0400, NULL, NULL, &proc_page_owner_operations); dir = debugfs_create_dir("page_owner_stacks", NULL); debugfs_create_file("show_stacks", 0400, dir, NULL, &page_owner_stack_operations); debugfs_create_file("count_threshold", 0600, dir, NULL, &proc_page_owner_threshold); return 0; } late_initcall(pageowner_init)
1795 1800 8 5 4 1110 1110 1791 1794 1793 1823 1823 1795 1112 1797 1794 1112 1111 1110 1110 1110 953 1796 1794 1796 1793 1794 1134 1145 1793 1799 1793 1739 94 1113 1109 1110 1110 1111 1112 1801 52 12 731 1457 1791 2 1438 1423 48 112 112 1112 1117 1116 1112 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 // SPDX-License-Identifier: GPL-2.0 /* * KFENCE guarded object allocator and fault handling. * * Copyright (C) 2020, Google LLC. */ #define pr_fmt(fmt) "kfence: " fmt #include <linux/atomic.h> #include <linux/bug.h> #include <linux/debugfs.h> #include <linux/hash.h> #include <linux/irq_work.h> #include <linux/jhash.h> #include <linux/kcsan-checks.h> #include <linux/kfence.h> #include <linux/kmemleak.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/log2.h> #include <linux/memblock.h> #include <linux/moduleparam.h> #include <linux/notifier.h> #include <linux/panic_notifier.h> #include <linux/random.h> #include <linux/rcupdate.h> #include <linux/sched/clock.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/string.h> #include <asm/kfence.h> #include "kfence.h" /* Disables KFENCE on the first warning assuming an irrecoverable error. */ #define KFENCE_WARN_ON(cond) \ ({ \ const bool __cond = WARN_ON(cond); \ if (unlikely(__cond)) { \ WRITE_ONCE(kfence_enabled, false); \ disabled_by_warn = true; \ } \ __cond; \ }) /* === Data ================================================================= */ static bool kfence_enabled __read_mostly; static bool disabled_by_warn __read_mostly; unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */ #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif #define MODULE_PARAM_PREFIX "kfence." static int kfence_enable_late(void); static int param_set_sample_interval(const char *val, const struct kernel_param *kp) { unsigned long num; int ret = kstrtoul(val, 0, &num); if (ret < 0) return ret; /* Using 0 to indicate KFENCE is disabled. */ if (!num && READ_ONCE(kfence_enabled)) { pr_info("disabled\n"); WRITE_ONCE(kfence_enabled, false); } *((unsigned long *)kp->arg) = num; if (num && !READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING) return disabled_by_warn ? -EINVAL : kfence_enable_late(); return 0; } static int param_get_sample_interval(char *buffer, const struct kernel_param *kp) { if (!READ_ONCE(kfence_enabled)) return sprintf(buffer, "0\n"); return param_get_ulong(buffer, kp); } static const struct kernel_param_ops sample_interval_param_ops = { .set = param_set_sample_interval, .get = param_get_sample_interval, }; module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600); /* Pool usage% threshold when currently covered allocations are skipped. */ static unsigned long kfence_skip_covered_thresh __read_mostly = 75; module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644); /* Allocation burst count: number of excess KFENCE allocations per sample. */ static unsigned int kfence_burst __read_mostly; module_param_named(burst, kfence_burst, uint, 0644); /* If true, use a deferrable timer. */ static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE); module_param_named(deferrable, kfence_deferrable, bool, 0444); /* If true, check all canary bytes on panic. */ static bool kfence_check_on_panic __read_mostly; module_param_named(check_on_panic, kfence_check_on_panic, bool, 0444); /* The pool of pages used for guard pages and objects. */ char *__kfence_pool __read_mostly; EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */ /* * Per-object metadata, with one-to-one mapping of object metadata to * backing pages (in __kfence_pool). */ static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0); struct kfence_metadata *kfence_metadata __read_mostly; /* * If kfence_metadata is not NULL, it may be accessed by kfence_shutdown_cache(). * So introduce kfence_metadata_init to initialize metadata, and then make * kfence_metadata visible after initialization is successful. This prevents * potential UAF or access to uninitialized metadata. */ static struct kfence_metadata *kfence_metadata_init __read_mostly; /* Freelist with available objects. */ static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist); static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */ /* * The static key to set up a KFENCE allocation; or if static keys are not used * to gate allocations, to avoid a load and compare if KFENCE is disabled. */ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key); /* Gates the allocation, ensuring only one succeeds in a given period. */ atomic_t kfence_allocation_gate = ATOMIC_INIT(1); /* * A Counting Bloom filter of allocation coverage: limits currently covered * allocations of the same source filling up the pool. * * Assuming a range of 15%-85% unique allocations in the pool at any point in * time, the below parameters provide a probablity of 0.02-0.33 for false * positive hits respectively: * * P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM */ #define ALLOC_COVERED_HNUM 2 #define ALLOC_COVERED_ORDER (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2) #define ALLOC_COVERED_SIZE (1 << ALLOC_COVERED_ORDER) #define ALLOC_COVERED_HNEXT(h) hash_32(h, ALLOC_COVERED_ORDER) #define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1) static atomic_t alloc_covered[ALLOC_COVERED_SIZE]; /* Stack depth used to determine uniqueness of an allocation. */ #define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8) /* * Randomness for stack hashes, making the same collisions across reboots and * different machines less likely. */ static u32 stack_hash_seed __ro_after_init; /* Statistics counters for debugfs. */ enum kfence_counter_id { KFENCE_COUNTER_ALLOCATED, KFENCE_COUNTER_ALLOCS, KFENCE_COUNTER_FREES, KFENCE_COUNTER_ZOMBIES, KFENCE_COUNTER_BUGS, KFENCE_COUNTER_SKIP_INCOMPAT, KFENCE_COUNTER_SKIP_CAPACITY, KFENCE_COUNTER_SKIP_COVERED, KFENCE_COUNTER_COUNT, }; static atomic_long_t counters[KFENCE_COUNTER_COUNT]; static const char *const counter_names[] = { [KFENCE_COUNTER_ALLOCATED] = "currently allocated", [KFENCE_COUNTER_ALLOCS] = "total allocations", [KFENCE_COUNTER_FREES] = "total frees", [KFENCE_COUNTER_ZOMBIES] = "zombie allocations", [KFENCE_COUNTER_BUGS] = "total bugs", [KFENCE_COUNTER_SKIP_INCOMPAT] = "skipped allocations (incompatible)", [KFENCE_COUNTER_SKIP_CAPACITY] = "skipped allocations (capacity)", [KFENCE_COUNTER_SKIP_COVERED] = "skipped allocations (covered)", }; static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT); /* === Internals ============================================================ */ static inline bool should_skip_covered(void) { unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100; return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh; } static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries) { num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH); num_entries = filter_irq_stacks(stack_entries, num_entries); return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed); } /* * Adds (or subtracts) count @val for allocation stack trace hash * @alloc_stack_hash from Counting Bloom filter. */ static void alloc_covered_add(u32 alloc_stack_hash, int val) { int i; for (i = 0; i < ALLOC_COVERED_HNUM; i++) { atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]); alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash); } } /* * Returns true if the allocation stack trace hash @alloc_stack_hash is * currently contained (non-zero count) in Counting Bloom filter. */ static bool alloc_covered_contains(u32 alloc_stack_hash) { int i; for (i = 0; i < ALLOC_COVERED_HNUM; i++) { if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK])) return false; alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash); } return true; } static bool kfence_protect(unsigned long addr) { return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true)); } static bool kfence_unprotect(unsigned long addr) { return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false)); } static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta) { unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2; unsigned long pageaddr = (unsigned long)&__kfence_pool[offset]; /* The checks do not affect performance; only called from slow-paths. */ /* Only call with a pointer into kfence_metadata. */ if (KFENCE_WARN_ON(meta < kfence_metadata || meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS)) return 0; /* * This metadata object only ever maps to 1 page; verify that the stored * address is in the expected range. */ if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr)) return 0; return pageaddr; } static inline bool kfence_obj_allocated(const struct kfence_metadata *meta) { enum kfence_object_state state = READ_ONCE(meta->state); return state == KFENCE_OBJECT_ALLOCATED || state == KFENCE_OBJECT_RCU_FREEING; } /* * Update the object's metadata state, including updating the alloc/free stacks * depending on the state transition. */ static noinline void metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next, unsigned long *stack_entries, size_t num_stack_entries) { struct kfence_track *track = next == KFENCE_OBJECT_ALLOCATED ? &meta->alloc_track : &meta->free_track; lockdep_assert_held(&meta->lock); /* Stack has been saved when calling rcu, skip. */ if (READ_ONCE(meta->state) == KFENCE_OBJECT_RCU_FREEING) goto out; if (stack_entries) { memcpy(track->stack_entries, stack_entries, num_stack_entries * sizeof(stack_entries[0])); } else { /* * Skip over 1 (this) functions; noinline ensures we do not * accidentally skip over the caller by never inlining. */ num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1); } track->num_stack_entries = num_stack_entries; track->pid = task_pid_nr(current); track->cpu = raw_smp_processor_id(); track->ts_nsec = local_clock(); /* Same source as printk timestamps. */ out: /* * Pairs with READ_ONCE() in * kfence_shutdown_cache(), * kfence_handle_page_fault(). */ WRITE_ONCE(meta->state, next); } #ifdef CONFIG_KMSAN #define check_canary_attributes noinline __no_kmsan_checks #else #define check_canary_attributes inline #endif /* Check canary byte at @addr. */ static check_canary_attributes bool check_canary_byte(u8 *addr) { struct kfence_metadata *meta; unsigned long flags; if (likely(*addr == KFENCE_CANARY_PATTERN_U8(addr))) return true; atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); meta = addr_to_metadata((unsigned long)addr); raw_spin_lock_irqsave(&meta->lock, flags); kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION); raw_spin_unlock_irqrestore(&meta->lock, flags); return false; } static inline void set_canary(const struct kfence_metadata *meta) { const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE); unsigned long addr = pageaddr; /* * The canary may be written to part of the object memory, but it does * not affect it. The user should initialize the object before using it. */ for (; addr < meta->addr; addr += sizeof(u64)) *((u64 *)addr) = KFENCE_CANARY_PATTERN_U64; addr = ALIGN_DOWN(meta->addr + meta->size, sizeof(u64)); for (; addr - pageaddr < PAGE_SIZE; addr += sizeof(u64)) *((u64 *)addr) = KFENCE_CANARY_PATTERN_U64; } static check_canary_attributes void check_canary(const struct kfence_metadata *meta) { const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE); unsigned long addr = pageaddr; /* * We'll iterate over each canary byte per-side until a corrupted byte * is found. However, we'll still iterate over the canary bytes to the * right of the object even if there was an error in the canary bytes to * the left of the object. Specifically, if check_canary_byte() * generates an error, showing both sides might give more clues as to * what the error is about when displaying which bytes were corrupted. */ /* Apply to left of object. */ for (; meta->addr - addr >= sizeof(u64); addr += sizeof(u64)) { if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64)) break; } /* * If the canary is corrupted in a certain 64 bytes, or the canary * memory cannot be completely covered by multiple consecutive 64 bytes, * it needs to be checked one by one. */ for (; addr < meta->addr; addr++) { if (unlikely(!check_canary_byte((u8 *)addr))) break; } /* Apply to right of object. */ for (addr = meta->addr + meta->size; addr % sizeof(u64) != 0; addr++) { if (unlikely(!check_canary_byte((u8 *)addr))) return; } for (; addr - pageaddr < PAGE_SIZE; addr += sizeof(u64)) { if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64)) { for (; addr - pageaddr < PAGE_SIZE; addr++) { if (!check_canary_byte((u8 *)addr)) return; } } } } static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp, unsigned long *stack_entries, size_t num_stack_entries, u32 alloc_stack_hash) { struct kfence_metadata *meta = NULL; unsigned long flags; struct slab *slab; void *addr; const bool random_right_allocate = get_random_u32_below(2); const bool random_fault = CONFIG_KFENCE_STRESS_TEST_FAULTS && !get_random_u32_below(CONFIG_KFENCE_STRESS_TEST_FAULTS); /* Try to obtain a free object. */ raw_spin_lock_irqsave(&kfence_freelist_lock, flags); if (!list_empty(&kfence_freelist)) { meta = list_entry(kfence_freelist.next, struct kfence_metadata, list); list_del_init(&meta->list); } raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); if (!meta) { atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]); return NULL; } if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) { /* * This is extremely unlikely -- we are reporting on a * use-after-free, which locked meta->lock, and the reporting * code via printk calls kmalloc() which ends up in * kfence_alloc() and tries to grab the same object that we're * reporting on. While it has never been observed, lockdep does * report that there is a possibility of deadlock. Fix it by * using trylock and bailing out gracefully. */ raw_spin_lock_irqsave(&kfence_freelist_lock, flags); /* Put the object back on the freelist. */ list_add_tail(&meta->list, &kfence_freelist); raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); return NULL; } meta->addr = metadata_to_pageaddr(meta); /* Unprotect if we're reusing this page. */ if (meta->state == KFENCE_OBJECT_FREED) kfence_unprotect(meta->addr); /* * Note: for allocations made before RNG initialization, will always * return zero. We still benefit from enabling KFENCE as early as * possible, even when the RNG is not yet available, as this will allow * KFENCE to detect bugs due to earlier allocations. The only downside * is that the out-of-bounds accesses detected are deterministic for * such allocations. */ if (random_right_allocate) { /* Allocate on the "right" side, re-calculate address. */ meta->addr += PAGE_SIZE - size; meta->addr = ALIGN_DOWN(meta->addr, cache->align); } addr = (void *)meta->addr; /* Update remaining metadata. */ metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries); /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */ WRITE_ONCE(meta->cache, cache); meta->size = size; meta->alloc_stack_hash = alloc_stack_hash; raw_spin_unlock_irqrestore(&meta->lock, flags); alloc_covered_add(alloc_stack_hash, 1); /* Set required slab fields. */ slab = virt_to_slab((void *)meta->addr); slab->slab_cache = cache; slab->objects = 1; /* Memory initialization. */ set_canary(meta); /* * We check slab_want_init_on_alloc() ourselves, rather than letting * SL*B do the initialization, as otherwise we might overwrite KFENCE's * redzone. */ if (unlikely(slab_want_init_on_alloc(gfp, cache))) memzero_explicit(addr, size); if (cache->ctor) cache->ctor(addr); if (random_fault) kfence_protect(meta->addr); /* Random "faults" by protecting the object. */ atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]); atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]); return addr; } static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie) { struct kcsan_scoped_access assert_page_exclusive; unsigned long flags; bool init; raw_spin_lock_irqsave(&meta->lock, flags); if (!kfence_obj_allocated(meta) || meta->addr != (unsigned long)addr) { /* Invalid or double-free, bail out. */ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_INVALID_FREE); raw_spin_unlock_irqrestore(&meta->lock, flags); return; } /* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */ kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE, KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT, &assert_page_exclusive); if (CONFIG_KFENCE_STRESS_TEST_FAULTS) kfence_unprotect((unsigned long)addr); /* To check canary bytes. */ /* Restore page protection if there was an OOB access. */ if (meta->unprotected_page) { memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE); kfence_protect(meta->unprotected_page); meta->unprotected_page = 0; } /* Mark the object as freed. */ metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0); init = slab_want_init_on_free(meta->cache); raw_spin_unlock_irqrestore(&meta->lock, flags); alloc_covered_add(meta->alloc_stack_hash, -1); /* Check canary bytes for memory corruption. */ check_canary(meta); /* * Clear memory if init-on-free is set. While we protect the page, the * data is still there, and after a use-after-free is detected, we * unprotect the page, so the data is still accessible. */ if (!zombie && unlikely(init)) memzero_explicit(addr, meta->size); /* Protect to detect use-after-frees. */ kfence_protect((unsigned long)addr); kcsan_end_scoped_access(&assert_page_exclusive); if (!zombie) { /* Add it to the tail of the freelist for reuse. */ raw_spin_lock_irqsave(&kfence_freelist_lock, flags); KFENCE_WARN_ON(!list_empty(&meta->list)); list_add_tail(&meta->list, &kfence_freelist); raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]); atomic_long_inc(&counters[KFENCE_COUNTER_FREES]); } else { /* See kfence_shutdown_cache(). */ atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]); } } static void rcu_guarded_free(struct rcu_head *h) { struct kfence_metadata *meta = container_of(h, struct kfence_metadata, rcu_head); kfence_guarded_free((void *)meta->addr, meta, false); } /* * Initialization of the KFENCE pool after its allocation. * Returns 0 on success; otherwise returns the address up to * which partial initialization succeeded. */ static unsigned long kfence_init_pool(void) { unsigned long addr; struct page *pages; int i; if (!arch_kfence_init_pool()) return (unsigned long)__kfence_pool; addr = (unsigned long)__kfence_pool; pages = virt_to_page(__kfence_pool); /* * Set up object pages: they must have PG_slab set, to avoid freeing * these as real pages. * * We also want to avoid inserting kfence_free() in the kfree() * fast-path in SLUB, and therefore need to ensure kfree() correctly * enters __slab_free() slow-path. */ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { struct slab *slab = page_slab(nth_page(pages, i)); if (!i || (i % 2)) continue; __folio_set_slab(slab_folio(slab)); #ifdef CONFIG_MEMCG slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts | MEMCG_DATA_OBJEXTS; #endif } /* * Protect the first 2 pages. The first page is mostly unnecessary, and * merely serves as an extended guard page. However, adding one * additional page in the beginning gives us an even number of pages, * which simplifies the mapping of address to metadata index. */ for (i = 0; i < 2; i++) { if (unlikely(!kfence_protect(addr))) return addr; addr += PAGE_SIZE; } for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { struct kfence_metadata *meta = &kfence_metadata_init[i]; /* Initialize metadata. */ INIT_LIST_HEAD(&meta->list); raw_spin_lock_init(&meta->lock); meta->state = KFENCE_OBJECT_UNUSED; meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */ list_add_tail(&meta->list, &kfence_freelist); /* Protect the right redzone. */ if (unlikely(!kfence_protect(addr + PAGE_SIZE))) goto reset_slab; addr += 2 * PAGE_SIZE; } /* * Make kfence_metadata visible only when initialization is successful. * Otherwise, if the initialization fails and kfence_metadata is freed, * it may cause UAF in kfence_shutdown_cache(). */ smp_store_release(&kfence_metadata, kfence_metadata_init); return 0; reset_slab: for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { struct slab *slab = page_slab(nth_page(pages, i)); if (!i || (i % 2)) continue; #ifdef CONFIG_MEMCG slab->obj_exts = 0; #endif __folio_clear_slab(slab_folio(slab)); } return addr; } static bool __init kfence_init_pool_early(void) { unsigned long addr; if (!__kfence_pool) return false; addr = kfence_init_pool(); if (!addr) { /* * The pool is live and will never be deallocated from this point on. * Ignore the pool object from the kmemleak phys object tree, as it would * otherwise overlap with allocations returned by kfence_alloc(), which * are registered with kmemleak through the slab post-alloc hook. */ kmemleak_ignore_phys(__pa(__kfence_pool)); return true; } /* * Only release unprotected pages, and do not try to go back and change * page attributes due to risk of failing to do so as well. If changing * page attributes for some pages fails, it is very likely that it also * fails for the first page, and therefore expect addr==__kfence_pool in * most failure cases. */ memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool)); __kfence_pool = NULL; memblock_free_late(__pa(kfence_metadata_init), KFENCE_METADATA_SIZE); kfence_metadata_init = NULL; return false; } /* === DebugFS Interface ==================================================== */ static int stats_show(struct seq_file *seq, void *v) { int i; seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled)); for (i = 0; i < KFENCE_COUNTER_COUNT; i++) seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i])); return 0; } DEFINE_SHOW_ATTRIBUTE(stats); /* * debugfs seq_file operations for /sys/kernel/debug/kfence/objects. * start_object() and next_object() return the object index + 1, because NULL is used * to stop iteration. */ static void *start_object(struct seq_file *seq, loff_t *pos) { if (*pos < CONFIG_KFENCE_NUM_OBJECTS) return (void *)((long)*pos + 1); return NULL; } static void stop_object(struct seq_file *seq, void *v) { } static void *next_object(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; if (*pos < CONFIG_KFENCE_NUM_OBJECTS) return (void *)((long)*pos + 1); return NULL; } static int show_object(struct seq_file *seq, void *v) { struct kfence_metadata *meta = &kfence_metadata[(long)v - 1]; unsigned long flags; raw_spin_lock_irqsave(&meta->lock, flags); kfence_print_object(seq, meta); raw_spin_unlock_irqrestore(&meta->lock, flags); seq_puts(seq, "---------------------------------\n"); return 0; } static const struct seq_operations objects_sops = { .start = start_object, .next = next_object, .stop = stop_object, .show = show_object, }; DEFINE_SEQ_ATTRIBUTE(objects); static int kfence_debugfs_init(void) { struct dentry *kfence_dir; if (!READ_ONCE(kfence_enabled)) return 0; kfence_dir = debugfs_create_dir("kfence", NULL); debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops); debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops); return 0; } late_initcall(kfence_debugfs_init); /* === Panic Notifier ====================================================== */ static void kfence_check_all_canary(void) { int i; for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { struct kfence_metadata *meta = &kfence_metadata[i]; if (kfence_obj_allocated(meta)) check_canary(meta); } } static int kfence_check_canary_callback(struct notifier_block *nb, unsigned long reason, void *arg) { kfence_check_all_canary(); return NOTIFY_OK; } static struct notifier_block kfence_check_canary_notifier = { .notifier_call = kfence_check_canary_callback, }; /* === Allocation Gate Timer ================================================ */ static struct delayed_work kfence_timer; #ifdef CONFIG_KFENCE_STATIC_KEYS /* Wait queue to wake up allocation-gate timer task. */ static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); static void wake_up_kfence_timer(struct irq_work *work) { wake_up(&allocation_wait); } static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer); #endif /* * Set up delayed work, which will enable and disable the static key. We need to * use a work queue (rather than a simple timer), since enabling and disabling a * static key cannot be done from an interrupt. * * Note: Toggling a static branch currently causes IPIs, and here we'll end up * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with * more aggressive sampling intervals), we could get away with a variant that * avoids IPIs, at the cost of not immediately capturing allocations if the * instructions remain cached. */ static void toggle_allocation_gate(struct work_struct *work) { if (!READ_ONCE(kfence_enabled)) return; atomic_set(&kfence_allocation_gate, -kfence_burst); #ifdef CONFIG_KFENCE_STATIC_KEYS /* Enable static key, and await allocation to happen. */ static_branch_enable(&kfence_allocation_key); wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate) > 0); /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); #endif queue_delayed_work(system_unbound_wq, &kfence_timer, msecs_to_jiffies(kfence_sample_interval)); } /* === Public interface ===================================================== */ void __init kfence_alloc_pool_and_metadata(void) { if (!kfence_sample_interval) return; /* * If the pool has already been initialized by arch, there is no need to * re-allocate the memory pool. */ if (!__kfence_pool) __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); if (!__kfence_pool) { pr_err("failed to allocate pool\n"); return; } /* The memory allocated by memblock has been zeroed out. */ kfence_metadata_init = memblock_alloc(KFENCE_METADATA_SIZE, PAGE_SIZE); if (!kfence_metadata_init) { pr_err("failed to allocate metadata\n"); memblock_free(__kfence_pool, KFENCE_POOL_SIZE); __kfence_pool = NULL; } } static void kfence_init_enable(void) { if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS)) static_branch_enable(&kfence_allocation_key); if (kfence_deferrable) INIT_DEFERRABLE_WORK(&kfence_timer, toggle_allocation_gate); else INIT_DELAYED_WORK(&kfence_timer, toggle_allocation_gate); if (kfence_check_on_panic) atomic_notifier_chain_register(&panic_notifier_list, &kfence_check_canary_notifier); WRITE_ONCE(kfence_enabled, true); queue_delayed_work(system_unbound_wq, &kfence_timer, 0); pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, (void *)(__kfence_pool + KFENCE_POOL_SIZE)); } void __init kfence_init(void) { stack_hash_seed = get_random_u32(); /* Setting kfence_sample_interval to 0 on boot disables KFENCE. */ if (!kfence_sample_interval) return; if (!kfence_init_pool_early()) { pr_err("%s failed\n", __func__); return; } kfence_init_enable(); } static int kfence_init_late(void) { const unsigned long nr_pages_pool = KFENCE_POOL_SIZE / PAGE_SIZE; const unsigned long nr_pages_meta = KFENCE_METADATA_SIZE / PAGE_SIZE; unsigned long addr = (unsigned long)__kfence_pool; unsigned long free_size = KFENCE_POOL_SIZE; int err = -ENOMEM; #ifdef CONFIG_CONTIG_ALLOC struct page *pages; pages = alloc_contig_pages(nr_pages_pool, GFP_KERNEL, first_online_node, NULL); if (!pages) return -ENOMEM; __kfence_pool = page_to_virt(pages); pages = alloc_contig_pages(nr_pages_meta, GFP_KERNEL, first_online_node, NULL); if (pages) kfence_metadata_init = page_to_virt(pages); #else if (nr_pages_pool > MAX_ORDER_NR_PAGES || nr_pages_meta > MAX_ORDER_NR_PAGES) { pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n"); return -EINVAL; } __kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL); if (!__kfence_pool) return -ENOMEM; kfence_metadata_init = alloc_pages_exact(KFENCE_METADATA_SIZE, GFP_KERNEL); #endif if (!kfence_metadata_init) goto free_pool; memzero_explicit(kfence_metadata_init, KFENCE_METADATA_SIZE); addr = kfence_init_pool(); if (!addr) { kfence_init_enable(); kfence_debugfs_init(); return 0; } pr_err("%s failed\n", __func__); free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool); err = -EBUSY; #ifdef CONFIG_CONTIG_ALLOC free_contig_range(page_to_pfn(virt_to_page((void *)kfence_metadata_init)), nr_pages_meta); free_pool: free_contig_range(page_to_pfn(virt_to_page((void *)addr)), free_size / PAGE_SIZE); #else free_pages_exact((void *)kfence_metadata_init, KFENCE_METADATA_SIZE); free_pool: free_pages_exact((void *)addr, free_size); #endif kfence_metadata_init = NULL; __kfence_pool = NULL; return err; } static int kfence_enable_late(void) { if (!__kfence_pool) return kfence_init_late(); WRITE_ONCE(kfence_enabled, true); queue_delayed_work(system_unbound_wq, &kfence_timer, 0); pr_info("re-enabled\n"); return 0; } void kfence_shutdown_cache(struct kmem_cache *s) { unsigned long flags; struct kfence_metadata *meta; int i; /* Pairs with release in kfence_init_pool(). */ if (!smp_load_acquire(&kfence_metadata)) return; for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { bool in_use; meta = &kfence_metadata[i]; /* * If we observe some inconsistent cache and state pair where we * should have returned false here, cache destruction is racing * with either kmem_cache_alloc() or kmem_cache_free(). Taking * the lock will not help, as different critical section * serialization will have the same outcome. */ if (READ_ONCE(meta->cache) != s || !kfence_obj_allocated(meta)) continue; raw_spin_lock_irqsave(&meta->lock, flags); in_use = meta->cache == s && kfence_obj_allocated(meta); raw_spin_unlock_irqrestore(&meta->lock, flags); if (in_use) { /* * This cache still has allocations, and we should not * release them back into the freelist so they can still * safely be used and retain the kernel's default * behaviour of keeping the allocations alive (leak the * cache); however, they effectively become "zombie * allocations" as the KFENCE objects are the only ones * still in use and the owning cache is being destroyed. * * We mark them freed, so that any subsequent use shows * more useful error messages that will include stack * traces of the user of the object, the original * allocation, and caller to shutdown_cache(). */ kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true); } } for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { meta = &kfence_metadata[i]; /* See above. */ if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED) continue; raw_spin_lock_irqsave(&meta->lock, flags); if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED) meta->cache = NULL; raw_spin_unlock_irqrestore(&meta->lock, flags); } } void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { unsigned long stack_entries[KFENCE_STACK_DEPTH]; size_t num_stack_entries; u32 alloc_stack_hash; int allocation_gate; /* * Perform size check before switching kfence_allocation_gate, so that * we don't disable KFENCE without making an allocation. */ if (size > PAGE_SIZE) { atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]); return NULL; } /* * Skip allocations from non-default zones, including DMA. We cannot * guarantee that pages in the KFENCE pool will have the requested * properties (e.g. reside in DMAable memory). */ if ((flags & GFP_ZONEMASK) || (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) { atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]); return NULL; } /* * Skip allocations for this slab, if KFENCE has been disabled for * this slab. */ if (s->flags & SLAB_SKIP_KFENCE) return NULL; allocation_gate = atomic_inc_return(&kfence_allocation_gate); if (allocation_gate > 1) return NULL; #ifdef CONFIG_KFENCE_STATIC_KEYS /* * waitqueue_active() is fully ordered after the update of * kfence_allocation_gate per atomic_inc_return(). */ if (allocation_gate == 1 && waitqueue_active(&allocation_wait)) { /* * Calling wake_up() here may deadlock when allocations happen * from within timer code. Use an irq_work to defer it. */ irq_work_queue(&wake_up_kfence_timer_work); } #endif if (!READ_ONCE(kfence_enabled)) return NULL; num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0); /* * Do expensive check for coverage of allocation in slow-path after * allocation_gate has already become non-zero, even though it might * mean not making any allocation within a given sample interval. * * This ensures reasonable allocation coverage when the pool is almost * full, including avoiding long-lived allocations of the same source * filling up the pool (e.g. pagecache allocations). */ alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries); if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) { atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]); return NULL; } return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries, alloc_stack_hash); } size_t kfence_ksize(const void *addr) { const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); /* * Read locklessly -- if there is a race with __kfence_alloc(), this is * either a use-after-free or invalid access. */ return meta ? meta->size : 0; } void *kfence_object_start(const void *addr) { const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); /* * Read locklessly -- if there is a race with __kfence_alloc(), this is * either a use-after-free or invalid access. */ return meta ? (void *)meta->addr : NULL; } void __kfence_free(void *addr) { struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); #ifdef CONFIG_MEMCG KFENCE_WARN_ON(meta->obj_exts.objcg); #endif /* * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing * the object, as the object page may be recycled for other-typed * objects once it has been freed. meta->cache may be NULL if the cache * was destroyed. * Save the stack trace here so that reports show where the user freed * the object. */ if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU))) { unsigned long flags; raw_spin_lock_irqsave(&meta->lock, flags); metadata_update_state(meta, KFENCE_OBJECT_RCU_FREEING, NULL, 0); raw_spin_unlock_irqrestore(&meta->lock, flags); call_rcu(&meta->rcu_head, rcu_guarded_free); } else { kfence_guarded_free(addr, meta, false); } } bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs) { const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE; struct kfence_metadata *to_report = NULL; enum kfence_error_type error_type; unsigned long flags; if (!is_kfence_address((void *)addr)) return false; if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */ return kfence_unprotect(addr); /* ... unprotect and proceed. */ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); if (page_index % 2) { /* This is a redzone, report a buffer overflow. */ struct kfence_metadata *meta; int distance = 0; meta = addr_to_metadata(addr - PAGE_SIZE); if (meta && kfence_obj_allocated(meta)) { to_report = meta; /* Data race ok; distance calculation approximate. */ distance = addr - data_race(meta->addr + meta->size); } meta = addr_to_metadata(addr + PAGE_SIZE); if (meta && kfence_obj_allocated(meta)) { /* Data race ok; distance calculation approximate. */ if (!to_report || distance > data_race(meta->addr) - addr) to_report = meta; } if (!to_report) goto out; raw_spin_lock_irqsave(&to_report->lock, flags); to_report->unprotected_page = addr; error_type = KFENCE_ERROR_OOB; /* * If the object was freed before we took the look we can still * report this as an OOB -- the report will simply show the * stacktrace of the free as well. */ } else { to_report = addr_to_metadata(addr); if (!to_report) goto out; raw_spin_lock_irqsave(&to_report->lock, flags); error_type = KFENCE_ERROR_UAF; /* * We may race with __kfence_alloc(), and it is possible that a * freed object may be reallocated. We simply report this as a * use-after-free, with the stack trace showing the place where * the object was re-allocated. */ } out: if (to_report) { kfence_report_error(addr, is_write, regs, to_report, error_type); raw_spin_unlock_irqrestore(&to_report->lock, flags); } else { /* This may be a UAF or OOB access, but we can't be sure. */ kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID); } return kfence_unprotect(addr); /* Unprotect and let access proceed. */ }
227 227 227 227 125 2 137 226 224 23 125 137 139 137 2 139 247 247 139 139 138 26 26 26 4 4 223 6 225 225 6 1 226 226 224 19 17 226 227 226 227 18 225 225 6 223 137 138 6 17 13 138 138 138 17 138 138 137 3 83 98 225 226 225 226 138 138 137 138 21 136 138 3 138 3 224 125 125 1 18 18 7 2 226 190 51 4 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * * This file is part of the SCTP kernel implementation * * These functions handle output processing. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@austin.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/kernel.h> #include <linux/wait.h> #include <linux/time.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/init.h> #include <linux/slab.h> #include <net/inet_ecn.h> #include <net/ip.h> #include <net/icmp.h> #include <net/net_namespace.h> #include <linux/socket.h> /* for sa_family_t */ #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/checksum.h> /* Forward declarations for private helpers. */ static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk); static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk); static void sctp_packet_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk); static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet, struct sctp_chunk *chunk, u16 chunk_len); static void sctp_packet_reset(struct sctp_packet *packet) { /* sctp_packet_transmit() relies on this to reset size to the * current overhead after sending packets. */ packet->size = packet->overhead; packet->has_cookie_echo = 0; packet->has_sack = 0; packet->has_data = 0; packet->has_auth = 0; packet->ipfragok = 0; packet->auth = NULL; } /* Config a packet. * This appears to be a followup set of initializations. */ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, int ecn_capable) { struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctp_sock *sp = NULL; struct sock *sk; pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag); packet->vtag = vtag; /* do the following jobs only once for a flush schedule */ if (!sctp_packet_empty(packet)) return; /* set packet max_size with pathmtu, then calculate overhead */ packet->max_size = tp->pathmtu; if (asoc) { sk = asoc->base.sk; sp = sctp_sk(sk); } packet->overhead = sctp_mtu_payload(sp, 0, 0); packet->size = p