/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_COREDUMP_H
#define _LINUX_SCHED_COREDUMP_H

#include <linux/mm_types.h>

#define SUID_DUMP_DISABLE	0	/* No setuid dumping */
#define SUID_DUMP_USER		1	/* Dump as user of process */
#define SUID_DUMP_ROOT		2	/* Dump as root */

static inline unsigned long __mm_flags_get_dumpable(const struct mm_struct *mm)
{
	/*
	 * By convention, dumpable bits are contained in first 32 bits of the
	 * bitmap, so we can simply access this first unsigned long directly.
	 */
	return __mm_flags_get_word(mm);
}

static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value)
{
	__mm_flags_set_mask_bits_word(mm, MMF_DUMPABLE_MASK, value);
}

extern void set_dumpable(struct mm_struct *mm, int value);

/*
 * This returns the actual value of the suid_dumpable flag. For things
 * that are using this for checking for privilege transitions, it must
 * test against SUID_DUMP_USER rather than treating it as a boolean
 * value.
 */
static inline int __get_dumpable(unsigned long mm_flags)
{
	return mm_flags & MMF_DUMPABLE_MASK;
}

static inline int get_dumpable(struct mm_struct *mm)
{
	unsigned long flags = __mm_flags_get_dumpable(mm);

	return __get_dumpable(flags);
}

#endif /* _LINUX_SCHED_COREDUMP_H */
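The comment above __get_dumpable() stresses that the dumpable value is a tri-state, not a boolean. A minimal standalone sketch of why a privilege-transition check must compare against SUID_DUMP_USER; the mask value and the demo_ names are illustrative assumptions (the dumpable state is assumed to sit in the low two flag bits), not kernel code:

#include <stdio.h>

/* Mirrors the tri-state dumpable values from the header above. */
#define SUID_DUMP_DISABLE	0
#define SUID_DUMP_USER		1
#define SUID_DUMP_ROOT		2

/* Assumption for the demo: dumpable state occupies the low two flag bits. */
#define DEMO_DUMPABLE_MASK	0x3UL

static int demo_get_dumpable(unsigned long mm_flags)
{
	return mm_flags & DEMO_DUMPABLE_MASK;
}

int main(void)
{
	unsigned long flags = SUID_DUMP_ROOT;	/* e.g. after a setuid exec */

	/* Wrong: a boolean test makes SUID_DUMP_ROOT look like a normal dump. */
	printf("boolean test:   %s\n",
	       demo_get_dumpable(flags) ? "dumpable" : "not dumpable");

	/* Right: privilege-transition checks compare against SUID_DUMP_USER. */
	printf("tri-state test: %s\n",
	       demo_get_dumpable(flags) == SUID_DUMP_USER ?
	       "plain user dump" : "restricted");
	return 0;
}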
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_PGTABLE_INVERT_H
#define _ASM_PGTABLE_INVERT_H 1

#ifndef __ASSEMBLER__

/*
 * A clear pte value is special, and doesn't get inverted.
 *
 * Note that even users that only pass a pgprot_t (rather
 * than a full pte) won't trigger the special zero case,
 * because even PAGE_NONE has _PAGE_PROTNONE | _PAGE_ACCESSED
 * set. So the all zero case really is limited to just the
 * cleared page table entry case.
 */
static inline bool __pte_needs_invert(u64 val)
{
	return val && !(val & _PAGE_PRESENT);
}

/* Get a mask to xor with the page table entry to get the correct pfn. */
static inline u64 protnone_mask(u64 val)
{
	return __pte_needs_invert(val) ? ~0ull : 0;
}

static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
{
	/*
	 * When a PTE transitions from NONE to !NONE or vice-versa
	 * invert the PFN part to stop speculation.
	 * pte_pfn undoes this when needed.
	 */
	if (__pte_needs_invert(oldval) != __pte_needs_invert(val))
		val = (val & ~mask) | (~val & mask);
	return val;
}

#endif /* __ASSEMBLER__ */

#endif
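The inversion in flip_protnone_guard() is a conditional bit-flip: the bits selected by @mask are complemented whenever a PTE changes between present and not-present, and XOR-ing with protnone_mask() on the read side restores the original PFN. A standalone userspace sketch of the same arithmetic; the constants and names are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_PRESENT 0x1ULL	/* stand-in for _PAGE_PRESENT */

static bool pte_needs_invert(uint64_t val)
{
	return val && !(val & DEMO_PAGE_PRESENT);
}

static uint64_t protnone_mask(uint64_t val)
{
	return pte_needs_invert(val) ? ~0ULL : 0;
}

static uint64_t flip_protnone_guard(uint64_t oldval, uint64_t val, uint64_t mask)
{
	/* Flip only the bits selected by mask when the NONE-ness changes. */
	if (pte_needs_invert(oldval) != pte_needs_invert(val))
		val = (val & ~mask) | (~val & mask);
	return val;
}

int main(void)
{
	uint64_t pfn_mask = 0x000ffffffffff000ULL;	/* assumed PFN field */
	uint64_t present  = 0x0000000123456000ULL | DEMO_PAGE_PRESENT;
	uint64_t none     = flip_protnone_guard(present,
						present & ~DEMO_PAGE_PRESENT,
						pfn_mask);

	printf("stored (inverted) PFN bits: %#llx\n",
	       (unsigned long long)(none & pfn_mask));
	/* Reading back undoes the inversion, as pte_pfn() does in the kernel. */
	printf("recovered PFN bits:        %#llx\n",
	       (unsigned long long)((none ^ protnone_mask(none)) & pfn_mask));
	return 0;
}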
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock - Credential hooks
 *
 * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
 * Copyright © 2018-2020 ANSSI
 * Copyright © 2024-2025 Microsoft Corporation
 */

#include <linux/binfmts.h>
#include <linux/cred.h>
#include <linux/lsm_hooks.h>

#include "common.h"
#include "cred.h"
#include "ruleset.h"
#include "setup.h"

static void hook_cred_transfer(struct cred *const new,
			       const struct cred *const old)
{
	const struct landlock_cred_security *const old_llcred =
		landlock_cred(old);

	if (old_llcred->domain) {
		landlock_get_ruleset(old_llcred->domain);
		*landlock_cred(new) = *old_llcred;
	}
}

static int hook_cred_prepare(struct cred *const new,
			     const struct cred *const old, const gfp_t gfp)
{
	hook_cred_transfer(new, old);
	return 0;
}

static void hook_cred_free(struct cred *const cred)
{
	struct landlock_ruleset *const dom = landlock_cred(cred)->domain;

	if (dom)
		landlock_put_ruleset_deferred(dom);
}

#ifdef CONFIG_AUDIT

static int hook_bprm_creds_for_exec(struct linux_binprm *const bprm)
{
	/* Resets for each execution. */
	landlock_cred(bprm->cred)->domain_exec = 0;
	return 0;
}

#endif /* CONFIG_AUDIT */

static struct security_hook_list landlock_hooks[] __ro_after_init = {
	LSM_HOOK_INIT(cred_prepare, hook_cred_prepare),
	LSM_HOOK_INIT(cred_transfer, hook_cred_transfer),
	LSM_HOOK_INIT(cred_free, hook_cred_free),

#ifdef CONFIG_AUDIT
	LSM_HOOK_INIT(bprm_creds_for_exec, hook_bprm_creds_for_exec),
#endif /* CONFIG_AUDIT */
};

__init void landlock_add_cred_hooks(void)
{
	security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
			   &landlock_lsmid);
}
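The three hooks above implement a straightforward reference-count lifecycle for the Landlock domain: preparing or transferring credentials takes an extra reference before duplicating the pointer, and freeing a credential drops one. A toy standalone sketch of that pattern; the types and helpers are stand-ins, not the Landlock API:

#include <stdio.h>
#include <stdlib.h>

struct ruleset  { int refs; };
struct cred_sec { struct ruleset *domain; };

static void get_ruleset(struct ruleset *r) { r->refs++; }

static void put_ruleset(struct ruleset *r)
{
	if (--r->refs == 0)
		free(r);
}

/* Mirrors hook_cred_transfer(): copying the pointer requires a new ref. */
static void transfer(struct cred_sec *new, const struct cred_sec *old)
{
	if (old->domain) {
		get_ruleset(old->domain);
		*new = *old;
	}
}

/* Mirrors hook_cred_free(): each credential drops its own reference. */
static void cred_free(struct cred_sec *c)
{
	if (c->domain)
		put_ruleset(c->domain);
}

int main(void)
{
	struct ruleset *dom = calloc(1, sizeof(*dom));
	struct cred_sec parent, child = { 0 };

	if (!dom)
		return 1;
	dom->refs = 1;
	parent.domain = dom;
	transfer(&child, &parent);	/* two creds, two references */
	cred_free(&parent);
	cred_free(&child);		/* last put frees the ruleset */
	printf("lifecycle done\n");
	return 0;
}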
/*
 * Written by: Matthew Dobson, IBM Corporation
 *
 * Copyright (C) 2002, IBM Corp.
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Send feedback to <colpatch@us.ibm.com>
 */
#ifndef _ASM_X86_TOPOLOGY_H
#define _ASM_X86_TOPOLOGY_H

/*
 * to preserve the visibility of NUMA_NO_NODE definition,
 * moved to there from here.  May be used independent of
 * CONFIG_NUMA.
 */
#include <linux/numa.h>
#include <linux/cpumask.h>

#ifdef CONFIG_NUMA
#include <asm/mpspec.h>
#include <asm/percpu.h>

/* Mappings between logical cpu number and node number */
DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
/*
 * override generic percpu implementation of cpu_to_node
 */
extern int __cpu_to_node(int cpu);
#define cpu_to_node __cpu_to_node

extern int early_cpu_to_node(int cpu);

#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */

/* Same function but used if called before per_cpu areas are setup */
static inline int early_cpu_to_node(int cpu)
{
	return early_per_cpu(x86_cpu_to_node_map, cpu);
}

#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */

/* Mappings between node number and cpus on that node. */
extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
extern const struct cpumask *cpumask_of_node(int node);
#else
/* Returns a pointer to the cpumask of CPUs on Node 'node'.
*/ static inline const struct cpumask *cpumask_of_node(int node) { return node_to_cpumask_map[node]; } #endif extern void setup_node_to_cpumask_map(void); #define pcibus_to_node(bus) __pcibus_to_node(bus) extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) #else /* !CONFIG_NUMA */ static inline int numa_node_id(void) { return 0; } /* * indicate override: */ #define numa_node_id numa_node_id static inline int early_cpu_to_node(int cpu) { return 0; } static inline void setup_node_to_cpumask_map(void) { } #endif #include <asm-generic/topology.h> /* Topology information */ enum x86_topology_domains { TOPO_SMT_DOMAIN, TOPO_CORE_DOMAIN, TOPO_MODULE_DOMAIN, TOPO_TILE_DOMAIN, TOPO_DIE_DOMAIN, TOPO_DIEGRP_DOMAIN, TOPO_PKG_DOMAIN, TOPO_MAX_DOMAIN, }; enum x86_topology_cpu_type { TOPO_CPU_TYPE_PERFORMANCE, TOPO_CPU_TYPE_EFFICIENCY, TOPO_CPU_TYPE_UNKNOWN, }; struct x86_topology_system { unsigned int dom_shifts[TOPO_MAX_DOMAIN]; unsigned int dom_size[TOPO_MAX_DOMAIN]; }; extern struct x86_topology_system x86_topo_system; static inline unsigned int topology_get_domain_size(enum x86_topology_domains dom) { return x86_topo_system.dom_size[dom]; } static inline unsigned int topology_get_domain_shift(enum x86_topology_domains dom) { return dom == TOPO_SMT_DOMAIN ? 0 : x86_topo_system.dom_shifts[dom - 1]; } extern const struct cpumask *cpu_coregroup_mask(int cpu); extern const struct cpumask *cpu_clustergroup_mask(int cpu); #define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id) #define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id) #define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id) #define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id) #define topology_die_id(cpu) (cpu_data(cpu).topo.die_id) #define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) #define topology_ppin(cpu) (cpu_data(cpu).ppin) #define topology_amd_node_id(cpu) (cpu_data(cpu).topo.amd_node_id) extern unsigned int __max_dies_per_package; extern unsigned int __max_logical_packages; extern unsigned int __max_threads_per_core; extern unsigned int __num_threads_per_package; extern unsigned int __num_cores_per_package; const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c); enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c); static inline unsigned int topology_max_packages(void) { return __max_logical_packages; } static inline unsigned int topology_max_dies_per_package(void) { return __max_dies_per_package; } static inline unsigned int topology_num_cores_per_package(void) { return __num_cores_per_package; } static inline unsigned int topology_num_threads_per_package(void) { return __num_threads_per_package; } #ifdef CONFIG_X86_LOCAL_APIC int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level); #else static inline int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level) { return 0; } #endif #ifdef CONFIG_SMP #define topology_cluster_id(cpu) (cpu_data(cpu).topo.l2c_id) #define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu)) #define topology_cluster_cpumask(cpu) (cpu_clustergroup_mask(cpu)) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return topology_get_logical_id(pkg << x86_topo_system.dom_shifts[TOPO_PKG_DOMAIN], TOPO_PKG_DOMAIN); } extern int __max_smt_threads; static inline int 
topology_max_smt_threads(void) { return __max_smt_threads; } #include <linux/cpu_smt.h> extern unsigned int __amd_nodes_per_pkg; static inline unsigned int topology_amd_nodes_per_pkg(void) { return __amd_nodes_per_pkg; } extern struct cpumask __cpu_primary_thread_mask; #define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask) /** * topology_is_primary_thread - Check whether CPU is the primary SMT thread * @cpu: CPU to check */ static inline bool topology_is_primary_thread(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_primary_thread_mask); } #define topology_is_primary_thread topology_is_primary_thread int topology_get_primary_thread(unsigned int cpu); static inline bool topology_is_core_online(unsigned int cpu) { int pcpu = topology_get_primary_thread(cpu); return pcpu >= 0 ? cpu_online(pcpu) : false; } #define topology_is_core_online topology_is_core_online #else /* CONFIG_SMP */ static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } static inline int topology_max_smt_threads(void) { return 1; } static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; } #endif /* !CONFIG_SMP */ static inline void arch_fix_phys_package_id(int num, u32 slot) { } struct pci_bus; int x86_pci_root_bus_node(int bus); void x86_pci_root_bus_resources(int bus, struct list_head *resources); extern bool x86_topology_update; #ifdef CONFIG_SCHED_MC_PRIO #include <asm/percpu.h> DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority); extern bool __read_mostly sysctl_sched_itmt_enabled; /* Interface to set priority of a cpu */ void sched_set_itmt_core_prio(int prio, int core_cpu); /* Interface to notify scheduler that system supports ITMT */ int sched_set_itmt_support(void); /* Interface to notify scheduler that system revokes ITMT support */ void sched_clear_itmt_support(void); #else /* CONFIG_SCHED_MC_PRIO */ #define sysctl_sched_itmt_enabled false static inline void sched_set_itmt_core_prio(int prio, int core_cpu) { } static inline int sched_set_itmt_support(void) { return 0; } static inline void sched_clear_itmt_support(void) { } #endif /* CONFIG_SCHED_MC_PRIO */ #if defined(CONFIG_SMP) && defined(CONFIG_X86_64) #include <asm/cpufeature.h> DECLARE_STATIC_KEY_FALSE(arch_scale_freq_key); #define arch_scale_freq_invariant() static_branch_likely(&arch_scale_freq_key) DECLARE_PER_CPU(unsigned long, arch_freq_scale); static inline long arch_scale_freq_capacity(int cpu) { return per_cpu(arch_freq_scale, cpu); } #define arch_scale_freq_capacity arch_scale_freq_capacity bool arch_enable_hybrid_capacity_scale(void); void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap, unsigned long cap_freq, unsigned long base_freq); unsigned long arch_scale_cpu_capacity(int cpu); #define arch_scale_cpu_capacity arch_scale_cpu_capacity extern void arch_set_max_freq_ratio(bool turbo_disabled); extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled); #else static inline bool arch_enable_hybrid_capacity_scale(void) { return false; } static inline void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap, unsigned long cap_freq, unsigned long base_freq) { } static inline void arch_set_max_freq_ratio(bool turbo_disabled) { } static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { } #endif extern void arch_scale_freq_tick(void); #define arch_scale_freq_tick arch_scale_freq_tick extern int arch_sched_node_distance(int from, int to); #endif /* _ASM_X86_TOPOLOGY_H */ |
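topology_get_domain_shift() and topology_get_logical_id() above express the idea that an APIC ID is a concatenation of per-domain bit fields: a domain's ID is the APIC ID shifted right by the shift recorded for the level below it. A standalone sketch of that indexing with made-up shift values (real systems enumerate these from CPUID, so the numbers here are assumptions for illustration only):

#include <stdio.h>

enum { DEMO_SMT, DEMO_CORE, DEMO_PKG, DEMO_MAX };

/* Hypothetical layout: 1 SMT bit, then 3 core bits, package above that. */
static const unsigned int demo_shifts[DEMO_MAX] = {
	[DEMO_SMT]  = 1,
	[DEMO_CORE] = 4,
	[DEMO_PKG]  = 4,
};

static unsigned int domain_id(unsigned int apicid, int dom)
{
	/* SMT IDs start at bit 0; higher domains shift the lower bits away. */
	return dom == DEMO_SMT ? apicid : apicid >> demo_shifts[dom - 1];
}

int main(void)
{
	unsigned int apicid = 0x13;	/* package 1, core 1, thread 1 */

	printf("thread-in-core: %u\n", domain_id(apicid, DEMO_SMT) & 1);
	printf("core id:        %u\n", domain_id(apicid, DEMO_CORE) & 7);
	printf("package id:     %u\n", domain_id(apicid, DEMO_PKG));
	return 0;
}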
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Management Component Transport Protocol (MCTP)
 *
 * Copyright (c) 2021 Code Construct
 * Copyright (c) 2021 Google
 */

#ifndef __NET_MCTP_H
#define __NET_MCTP_H

#include <linux/bits.h>
#include <linux/mctp.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>

/* MCTP packet definitions */
struct mctp_hdr {
	u8	ver;
	u8	dest;
	u8	src;
	u8	flags_seq_tag;
};

#define MCTP_VER_MIN	1
#define MCTP_VER_MAX	1

/* Definitions for flags_seq_tag field */
#define MCTP_HDR_FLAG_SOM	BIT(7)
#define MCTP_HDR_FLAG_EOM	BIT(6)
#define MCTP_HDR_FLAG_TO	BIT(3)
#define MCTP_HDR_FLAGS		GENMASK(5, 3)
#define MCTP_HDR_SEQ_SHIFT	4
#define MCTP_HDR_SEQ_MASK	GENMASK(1, 0)
#define MCTP_HDR_TAG_SHIFT	0
#define MCTP_HDR_TAG_MASK	GENMASK(2, 0)

#define MCTP_INITIAL_DEFAULT_NET	1

static inline bool mctp_address_unicast(mctp_eid_t eid)
{
	return eid >= 8 && eid < 255;
}

static inline bool mctp_address_broadcast(mctp_eid_t eid)
{
	return eid == 255;
}

static inline bool mctp_address_null(mctp_eid_t eid)
{
	return eid == 0;
}

static inline bool mctp_address_matches(mctp_eid_t match, mctp_eid_t eid)
{
	return match == eid || match == MCTP_ADDR_ANY;
}

static inline struct mctp_hdr *mctp_hdr(struct sk_buff *skb)
{
	return (struct mctp_hdr *)skb_network_header(skb);
}

/* socket implementation */
struct mctp_sock {
	struct sock	sk;

	/* bind() params */
	unsigned int	bind_net;
	mctp_eid_t	bind_local_addr;
	mctp_eid_t	bind_peer_addr;
	unsigned int	bind_peer_net;
	bool		bind_peer_set;
	__u8		bind_type;

	/* sendmsg()/recvmsg() uses struct sockaddr_mctp_ext */
	bool		addr_ext;

	/* list of mctp_sk_key, for incoming tag lookup. updates protected
	 * by sk->net->keys_lock
	 */
	struct hlist_head keys;

	/* mechanism for expiring allocated keys; will release an allocated
	 * tag, and any netdev state for a request/response pairing
	 */
	struct timer_list key_expiry;
};

/* Key for matching incoming packets to sockets or reassembly contexts.
 * Packets are matched on (peer EID, local EID, tag).
 *
 * Lifetime / locking requirements:
 *
 *  - individual key data (ie, the struct itself) is protected by key->lock;
 *    changes must be made with that lock held.
 *
 *  - the lookup fields: peer_addr, local_addr and tag are set before the
 *    key is added to lookup lists, and never updated.
 *
 *  - A ref to the key must be held (through key->refs) if a pointer to the
 *    key is to be accessed after key->lock is released.
 *
 *  - a mctp_sk_key contains a reference to a struct sock; this is valid
 *    for the life of the key. On sock destruction (through unhash), the key is
 *    removed from lists (see below), and marked invalid.
 *
 *  - these mctp_sk_keys appear on two lists:
 *      1) the struct mctp_sock->keys list
 *      2) the struct netns_mctp->keys list
 *
 *    presence on these lists requires a (single) refcount to be held; both
 *    lists are updated as a single operation.
 *
 *    Updates and lookups in either list are performed under the
 *    netns_mctp->keys lock. Lookup functions will need to lock the key and
 *    take a reference before unlocking the keys_lock. Consequently, the list's
 *    keys_lock *cannot* be acquired with the individual key->lock held.
 *
 *  - a key may have a sk_buff attached as part of an in-progress message
 *    reassembly (->reasm_head). The reasm data is protected by the individual
 *    key->lock.
 *
 *  - there are two destruction paths for a mctp_sk_key:
 *
 *     - through socket unhash (see mctp_sk_unhash). This performs the list
 *       removal under keys_lock.
 *
 *     - where a key is established to receive a reply message: after receiving
 *       the (complete) reply, or during reassembly errors. Here, we clean up
 *       the reassembly context (marking reasm_dead, to prevent another from
 *       starting), and remove the socket from the netns & socket lists.
 *
 *     - through an expiry timeout, on a per-socket timer
 */
struct mctp_sk_key {
	unsigned int	net;
	mctp_eid_t	peer_addr;
	mctp_eid_t	local_addr; /* MCTP_ADDR_ANY for local owned tags */
	__u8		tag; /* incoming tag match; invert TO for local */

	/* we hold a ref to sk when set */
	struct sock	*sk;

	/* routing lookup list */
	struct hlist_node hlist;

	/* per-socket list */
	struct hlist_node sklist;

	/* lock protects against concurrent updates to the reassembly and
	 * expiry data below.
	 */
	spinlock_t	lock;

	/* Keys are referenced during the output path, which may sleep */
	refcount_t	refs;

	/* incoming fragment reassembly context */
	struct sk_buff	*reasm_head;
	struct sk_buff	**reasm_tailp;
	bool		reasm_dead;
	u8		last_seq;

	/* key validity */
	bool		valid;

	/* expiry timeout; valid (above) cleared on expiry */
	unsigned long	expiry;

	/* free to use for device flow state tracking. Initialised to
	 * zero on initial key creation
	 */
	unsigned long	dev_flow_state;
	struct mctp_dev	*dev;

	/* a tag allocated with SIOCMCTPALLOCTAG ioctl will not expire
	 * automatically on timeout or response, instead SIOCMCTPDROPTAG
	 * is used.
	 */
	bool		manual_alloc;
};

struct mctp_skb_cb {
	unsigned int	magic;
	unsigned int	net;
	/* fields below provide extended addressing for ingress to recvmsg() */
	int		ifindex;
	unsigned char	halen;
	unsigned char	haddr[MAX_ADDR_LEN];
};

/* skb control-block accessors with a little extra debugging for initial
 * development.
 *
 * TODO: remove checks & mctp_skb_cb->magic; replace callers of __mctp_cb
 * with mctp_cb().
 *
 * __mctp_cb() is only for the initial ingress code; we should see ->magic set
 * at all times after this.
*/ static inline struct mctp_skb_cb *__mctp_cb(struct sk_buff *skb) { struct mctp_skb_cb *cb = (void *)skb->cb; cb->magic = 0x4d435450; return cb; } static inline struct mctp_skb_cb *mctp_cb(struct sk_buff *skb) { struct mctp_skb_cb *cb = (void *)skb->cb; BUILD_BUG_ON(sizeof(struct mctp_skb_cb) > sizeof(skb->cb)); WARN_ON(cb->magic != 0x4d435450); return cb; } /* If CONFIG_MCTP_FLOWS, we may add one of these as a SKB extension, * indicating the flow to the device driver. */ struct mctp_flow { struct mctp_sk_key *key; }; struct mctp_dst; /* Route definition. * * These are held in the pernet->mctp.routes list, with RCU protection for * removed routes. We hold a reference to the netdev; routes need to be * dropped on NETDEV_UNREGISTER events. * * Updates to the route table are performed under rtnl; all reads under RCU, * so routes cannot be referenced over a RCU grace period. */ struct mctp_route { mctp_eid_t min, max; unsigned char type; unsigned int mtu; enum { MCTP_ROUTE_DIRECT, MCTP_ROUTE_GATEWAY, } dst_type; union { struct mctp_dev *dev; struct mctp_fq_addr gateway; }; int (*output)(struct mctp_dst *dst, struct sk_buff *skb); struct list_head list; refcount_t refs; struct rcu_head rcu; }; /* Route lookup result: dst. Represents the results of a routing decision, * but is only held over the individual routing operation. * * Will typically be stored on the caller stack, and must be released after * usage. */ struct mctp_dst { struct mctp_dev *dev; unsigned int mtu; mctp_eid_t nexthop; /* set for direct addressing */ unsigned char halen; unsigned char haddr[MAX_ADDR_LEN]; int (*output)(struct mctp_dst *dst, struct sk_buff *skb); }; int mctp_dst_from_extaddr(struct mctp_dst *dst, struct net *net, int ifindex, unsigned char halen, const unsigned char *haddr); /* route interfaces */ int mctp_route_lookup(struct net *net, unsigned int dnet, mctp_eid_t daddr, struct mctp_dst *dst); void mctp_dst_release(struct mctp_dst *dst); /* always takes ownership of skb */ int mctp_local_output(struct sock *sk, struct mctp_dst *dst, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag); void mctp_key_unref(struct mctp_sk_key *key); struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, unsigned int netid, mctp_eid_t local, mctp_eid_t peer, bool manual, u8 *tagp); /* routing <--> device interface */ unsigned int mctp_default_net(struct net *net); int mctp_default_net_set(struct net *net, unsigned int index); int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr); int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr); void mctp_route_remove_dev(struct mctp_dev *mdev); /* neighbour definitions */ enum mctp_neigh_source { MCTP_NEIGH_STATIC, MCTP_NEIGH_DISCOVER, }; struct mctp_neigh { struct mctp_dev *dev; mctp_eid_t eid; enum mctp_neigh_source source; unsigned char ha[MAX_ADDR_LEN]; struct list_head list; struct rcu_head rcu; }; int mctp_neigh_init(void); void mctp_neigh_exit(void); // ret_hwaddr may be NULL, otherwise must have space for MAX_ADDR_LEN int mctp_neigh_lookup(struct mctp_dev *dev, mctp_eid_t eid, void *ret_hwaddr); void mctp_neigh_remove_dev(struct mctp_dev *mdev); int mctp_routes_init(void); void mctp_routes_exit(void); int mctp_device_init(void); void mctp_device_exit(void); /* MCTP IDs and Codes from DMTF specification * "DSP0239 Management Component Transport Protocol (MCTP) IDs and Codes" * https://www.dmtf.org/sites/default/files/standards/documents/DSP0239_1.11.1.pdf */ enum mctp_phys_binding { MCTP_PHYS_BINDING_UNSPEC = 0x00, MCTP_PHYS_BINDING_SMBUS = 0x01, 
MCTP_PHYS_BINDING_PCIE_VDM = 0x02, MCTP_PHYS_BINDING_USB = 0x03, MCTP_PHYS_BINDING_KCS = 0x04, MCTP_PHYS_BINDING_SERIAL = 0x05, MCTP_PHYS_BINDING_I3C = 0x06, MCTP_PHYS_BINDING_MMBI = 0x07, MCTP_PHYS_BINDING_PCC = 0x08, MCTP_PHYS_BINDING_UCIE = 0x09, MCTP_PHYS_BINDING_VENDOR = 0xFF, }; #endif /* __NET_MCTP_H */ |
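The MCTP_HDR_* definitions above describe how the single flags_seq_tag byte carries the SOM/EOM flags, a 2-bit sequence number, the tag-owner bit and a 3-bit tag. A standalone sketch that packs and unpacks that byte using equivalent plain constants (mirroring, not reusing, the kernel macros):

#include <stdint.h>
#include <stdio.h>

/* Bit layout of the MCTP flags_seq_tag byte, matching the header above. */
#define DEMO_SOM	(1u << 7)
#define DEMO_EOM	(1u << 6)
#define DEMO_TO		(1u << 3)
#define DEMO_SEQ_SHIFT	4
#define DEMO_SEQ_MASK	0x3u
#define DEMO_TAG_MASK	0x7u

static uint8_t pack_fst(int som, int eom, unsigned int seq, int to,
			unsigned int tag)
{
	return (som ? DEMO_SOM : 0) | (eom ? DEMO_EOM : 0) |
	       ((seq & DEMO_SEQ_MASK) << DEMO_SEQ_SHIFT) |
	       (to ? DEMO_TO : 0) | (tag & DEMO_TAG_MASK);
}

int main(void)
{
	/* First fragment of a request: SOM set, EOM clear, seq 0, TO set, tag 5. */
	uint8_t fst = pack_fst(1, 0, 0, 1, 5);

	printf("flags_seq_tag = 0x%02x\n", fst);
	printf("seq = %u, tag = %u, TO = %u\n",
	       (fst >> DEMO_SEQ_SHIFT) & DEMO_SEQ_MASK,
	       fst & DEMO_TAG_MASK, !!(fst & DEMO_TO));
	return 0;
}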
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ip_vs_proto_tcp.c:	TCP load balancing support for IPVS
 *
 * Authors:	Wensong Zhang <wensong@linuxvirtualserver.org>
 *		Julian Anastasov <ja@ssi.bg>
 *
 * Changes:	Hans Schillstrom <hans.schillstrom@ericsson.com>
 *
 *		Network name space (netns) aware.
* Global data moved to netns i.e struct netns_ipvs * tcp_timeouts table has copy per netns in a hash table per * protocol ip_vs_proto_data and is handled by netns */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/kernel.h> #include <linux/ip.h> #include <linux/tcp.h> /* for tcphdr */ #include <net/ip.h> #include <net/tcp.h> /* for csum_tcpudp_magic */ #include <net/ip6_checksum.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/indirect_call_wrapper.h> #include <net/ip_vs.h> static int tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); static int tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { struct ip_vs_service *svc; struct tcphdr _tcph, *th; __be16 _ports[2], *ports = NULL; /* In the event of icmp, we're only guaranteed to have the first 8 * bytes of the transport header, so we only check the rest of the * TCP packet for non-ICMP packets */ if (likely(!ip_vs_iph_icmp(iph))) { th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); if (th) { if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn)) return 1; ports = &th->source; } } else { ports = skb_header_pointer( skb, iph->len, sizeof(_ports), &_ports); } if (!ports) { *verdict = NF_DROP; return 0; } /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ if (likely(!ip_vs_iph_inverse(iph))) svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->daddr, ports[1]); else svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->saddr, ports[0]); if (svc) { int ignored; if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( */ *verdict = NF_DROP; return 0; } /* * Let the virtual server select a real server for the * incoming connection, and create a connection entry. 
*/ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); if (!*cpp && ignored <= 0) { if (!ignored) *verdict = ip_vs_leave(svc, skb, pd, iph); else *verdict = NF_DROP; return 0; } } /* NF_ACCEPT */ return 1; } static inline void tcp_fast_csum_update(int af, struct tcphdr *tcph, const union nf_inet_addr *oldip, const union nf_inet_addr *newip, __be16 oldport, __be16 newport) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcph->check = csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldport, newport, ~csum_unfold(tcph->check)))); else #endif tcph->check = csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldport, newport, ~csum_unfold(tcph->check)))); } static inline void tcp_partial_csum_update(int af, struct tcphdr *tcph, const union nf_inet_addr *oldip, const union nf_inet_addr *newip, __be16 oldlen, __be16 newlen) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcph->check = ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldlen, newlen, csum_unfold(tcph->check)))); else #endif tcph->check = ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldlen, newlen, csum_unfold(tcph->check)))); } INDIRECT_CALLABLE_SCOPE int tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct tcphdr *tcph; unsigned int tcphoff = iph->len; bool payload_csum = false; int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!tcp_csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ if (!(ret = ip_vs_app_pkt_out(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) oldlen = skb->len - tcphoff; else payload_csum = true; } tcph = (void *)skb_network_header(skb) + tcphoff; tcph->source = cp->vport; /* Adjust TCP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, htons(oldlen), htons(skb->len - tcphoff)); } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = cp->app ? 
CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) tcph->check = csum_ipv6_magic(&cp->vaddr.in6, &cp->caddr.in6, skb->len - tcphoff, cp->protocol, skb->csum); else #endif tcph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip, skb->len - tcphoff, cp->protocol, skb->csum); skb->ip_summed = CHECKSUM_UNNECESSARY; IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", pp->name, tcph->check, (char*)&(tcph->check) - (char*)tcph); } return 1; } static int tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct tcphdr *tcph; unsigned int tcphoff = iph->len; bool payload_csum = false; int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!tcp_csum_check(cp->af, skb, pp)) return 0; /* * Attempt ip_vs_app call. * It will fix ip_vs_conn and iph ack_seq stuff */ if (!(ret = ip_vs_app_pkt_in(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) oldlen = skb->len - tcphoff; else payload_csum = true; } tcph = (void *)skb_network_header(skb) + tcphoff; tcph->dest = cp->dport; /* * Adjust TCP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - tcphoff)); } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = cp->app ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) tcph->check = csum_ipv6_magic(&cp->caddr.in6, &cp->daddr.in6, skb->len - tcphoff, cp->protocol, skb->csum); else #endif tcph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip, skb->len - tcphoff, cp->protocol, skb->csum); skb->ip_summed = CHECKSUM_UNNECESSARY; } return 1; } static int tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) { unsigned int tcphoff; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcphoff = sizeof(struct ipv6hdr); else #endif tcphoff = ip_hdrlen(skb); switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); fallthrough; case CHECKSUM_COMPLETE: #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - tcphoff, ipv6_hdr(skb)->nexthdr, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } } else #endif if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len - tcphoff, ip_hdr(skb)->protocol, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } break; default: /* No need to checksum. 
*/ break; } return 1; } #define TCP_DIR_INPUT 0 #define TCP_DIR_OUTPUT 4 #define TCP_DIR_INPUT_ONLY 8 static const int tcp_state_off[IP_VS_DIR_LAST] = { [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, }; /* * Timeout table[state] */ static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = 2*HZ, [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, [IP_VS_TCP_S_CLOSE] = 10*HZ, [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, [IP_VS_TCP_S_LAST_ACK] = 30*HZ, [IP_VS_TCP_S_LISTEN] = 2*60*HZ, [IP_VS_TCP_S_SYNACK] = 120*HZ, [IP_VS_TCP_S_LAST] = 2*HZ, }; static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = "NONE", [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", [IP_VS_TCP_S_CLOSE] = "CLOSE", [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", [IP_VS_TCP_S_LISTEN] = "LISTEN", [IP_VS_TCP_S_SYNACK] = "SYNACK", [IP_VS_TCP_S_LAST] = "BUG!", }; static const bool tcp_state_active_table[IP_VS_TCP_S_LAST] = { [IP_VS_TCP_S_NONE] = false, [IP_VS_TCP_S_ESTABLISHED] = true, [IP_VS_TCP_S_SYN_SENT] = true, [IP_VS_TCP_S_SYN_RECV] = true, [IP_VS_TCP_S_FIN_WAIT] = false, [IP_VS_TCP_S_TIME_WAIT] = false, [IP_VS_TCP_S_CLOSE] = false, [IP_VS_TCP_S_CLOSE_WAIT] = false, [IP_VS_TCP_S_LAST_ACK] = false, [IP_VS_TCP_S_LISTEN] = false, [IP_VS_TCP_S_SYNACK] = true, }; #define sNO IP_VS_TCP_S_NONE #define sES IP_VS_TCP_S_ESTABLISHED #define sSS IP_VS_TCP_S_SYN_SENT #define sSR IP_VS_TCP_S_SYN_RECV #define sFW IP_VS_TCP_S_FIN_WAIT #define sTW IP_VS_TCP_S_TIME_WAIT #define sCL IP_VS_TCP_S_CLOSE #define sCW IP_VS_TCP_S_CLOSE_WAIT #define sLA IP_VS_TCP_S_LAST_ACK #define sLI IP_VS_TCP_S_LISTEN #define sSA IP_VS_TCP_S_SYNACK struct tcp_states_t { int next_state[IP_VS_TCP_S_LAST]; }; static const char * tcp_state_name(int state) { if (state >= IP_VS_TCP_S_LAST) return "ERR!"; return tcp_state_name_table[state] ? 
tcp_state_name_table[state] : "?"; } static bool tcp_state_active(int state) { if (state >= IP_VS_TCP_S_LAST) return false; return tcp_state_active_table[state]; } static struct tcp_states_t tcp_states[] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, /* OUTPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, /* INPUT-ONLY */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; static struct tcp_states_t tcp_states_dos[] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, /*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, /* OUTPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, /* INPUT-ONLY */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags) { int on = (flags & 1); /* secure_tcp */ /* ** FIXME: change secure_tcp to independent sysctl var ** or make it per-service or per-app because it is valid ** for most if not for all of the applications. Something ** like "capabilities" (flags) for each object. */ pd->tcp_state_table = (on ? 
tcp_states_dos : tcp_states); } static inline int tcp_state_idx(struct tcphdr *th) { if (th->rst) return 3; if (th->syn) return 0; if (th->fin) return 1; if (th->ack) return 2; return -1; } static inline void set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int direction, struct tcphdr *th) { int state_idx; int new_state = IP_VS_TCP_S_CLOSE; int state_off = tcp_state_off[direction]; /* * Update state offset to INPUT_ONLY if necessary * or delete NO_OUTPUT flag if output packet detected */ if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { if (state_off == TCP_DIR_OUTPUT) cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; else state_off = TCP_DIR_INPUT_ONLY; } if ((state_idx = tcp_state_idx(th)) < 0) { IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); goto tcp_state_out; } new_state = pd->tcp_state_table[state_off+state_idx].next_state[cp->state]; tcp_state_out: if (new_state != cp->state) { struct ip_vs_dest *dest = cp->dest; IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d " "d:%s:%d state: %s->%s conn->refcnt:%d\n", pd->pp->name, ((state_off == TCP_DIR_OUTPUT) ? "output " : "input "), th->syn ? 'S' : '.', th->fin ? 'F' : '.', th->ack ? 'A' : '.', th->rst ? 'R' : '.', IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), tcp_state_name(cp->state), tcp_state_name(new_state), refcount_read(&cp->refcnt)); if (dest) { if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && !tcp_state_active(new_state)) { atomic_dec(&dest->activeconns); atomic_inc(&dest->inactconns); cp->flags |= IP_VS_CONN_F_INACTIVE; } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && tcp_state_active(new_state)) { atomic_inc(&dest->activeconns); atomic_dec(&dest->inactconns); cp->flags &= ~IP_VS_CONN_F_INACTIVE; } } if (new_state == IP_VS_TCP_S_ESTABLISHED) ip_vs_control_assure_ct(cp); } if (likely(pd)) cp->timeout = pd->timeout_table[cp->state = new_state]; else /* What to do ? */ cp->timeout = tcp_timeouts[cp->state = new_state]; } /* * Handle state transitions */ static void tcp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { struct tcphdr _tcph, *th; #ifdef CONFIG_IP_VS_IPV6 int ihl = cp->af == AF_INET ? 
ip_hdrlen(skb) : sizeof(struct ipv6hdr); #else int ihl = ip_hdrlen(skb); #endif th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); if (th == NULL) return; spin_lock_bh(&cp->lock); set_tcp_state(pd, cp, direction, th); spin_unlock_bh(&cp->lock); } static inline __u16 tcp_app_hashkey(__be16 port) { return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) & TCP_APP_TAB_MASK; } static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); hash = tcp_app_hashkey(port); list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]); atomic_inc(&pd->appcnt); out: return ret; } static void tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); } static int tcp_app_conn_bind(struct ip_vs_conn *cp) { struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app *inc; int result = 0; /* Default binding: bind app only for NAT */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) return 0; /* Lookup application incarnations and bind the right one */ hash = tcp_app_hashkey(cp->vport); list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", __func__, IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), inc->name, ntohs(inc->port)); cp->app = inc; if (inc->init_conn) result = inc->init_conn(inc, cp); break; } } return result; } /* * Set LISTEN timeout. (ip_vs_conn_put will setup timer) */ void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP); spin_lock_bh(&cp->lock); cp->state = IP_VS_TCP_S_LISTEN; cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] : tcp_timeouts[IP_VS_TCP_S_LISTEN]); spin_unlock_bh(&cp->lock); } /* --------------------------------------------- * timeouts is netns related now. * --------------------------------------------- */ static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, sizeof(tcp_timeouts)); if (!pd->timeout_table) return -ENOMEM; pd->tcp_state_table = tcp_states; return 0; } static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } struct ip_vs_protocol ip_vs_protocol_tcp = { .name = "TCP", .protocol = IPPROTO_TCP, .num_states = IP_VS_TCP_S_LAST, .dont_defrag = 0, .init = NULL, .exit = NULL, .init_netns = __ip_vs_tcp_init, .exit_netns = __ip_vs_tcp_exit, .register_app = tcp_register_app, .unregister_app = tcp_unregister_app, .conn_schedule = tcp_conn_schedule, .conn_in_get = ip_vs_conn_in_get_proto, .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = tcp_snat_handler, .dnat_handler = tcp_dnat_handler, .state_name = tcp_state_name, .state_transition = tcp_state_transition, .app_conn_bind = tcp_app_conn_bind, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = tcp_timeout_change, }; |
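tcp_fast_csum_update() above relies on incremental (RFC 1624-style) checksum adjustment: when only an address or port word changes, the existing checksum can be patched by folding in the difference between the old and new 16-bit word instead of re-summing the whole segment. A standalone sketch of that arithmetic; the kernel uses the ip_vs_check_diff*() helpers rather than these demo functions:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Plain 16-bit one's-complement checksum over an array of words. */
static uint16_t csum16(const uint16_t *words, size_t n)
{
	uint32_t sum = 0;

	while (n--) {
		sum += *words++;
		sum = (sum & 0xffff) + (sum >> 16);
	}
	return ~sum & 0xffff;
}

/* Incremental update: fold the old/new word difference into the checksum. */
static uint16_t csum16_update(uint16_t check, uint16_t old_word,
			      uint16_t new_word)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old_word;
	sum += new_word;
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return ~sum & 0xffff;
}

int main(void)
{
	uint16_t pkt[4] = { 0x1234, 0x0050, 0xdead, 0xbeef };
	uint16_t before = csum16(pkt, 4);
	/* Rewrite the port word, as the SNAT/DNAT handlers do. */
	uint16_t updated = csum16_update(before, pkt[1], 0x1f90);

	pkt[1] = 0x1f90;
	printf("full recompute: 0x%04x\n", csum16(pkt, 4));
	printf("incremental:    0x%04x\n", updated);
	return 0;
}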
/*
 * videobuf2-v4l2.h - V4L2 driver helper framework
 *
 * Copyright (C) 2010 Samsung Electronics
 *
 * Author: Pawel Osciak <pawel@osciak.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation.
 */
#ifndef _MEDIA_VIDEOBUF2_V4L2_H
#define _MEDIA_VIDEOBUF2_V4L2_H

#include <linux/videodev2.h>
#include <media/videobuf2-core.h>

#if VB2_MAX_FRAME != VIDEO_MAX_FRAME
#error VB2_MAX_FRAME != VIDEO_MAX_FRAME
#endif

#if VB2_MAX_PLANES != VIDEO_MAX_PLANES
#error VB2_MAX_PLANES != VIDEO_MAX_PLANES
#endif

struct video_device;

/**
 * struct vb2_v4l2_buffer - video buffer information for v4l2.
 *
 * @vb2_buf:	embedded struct &vb2_buffer.
 * @flags:	buffer informational flags.
 * @field:	field order of the image in the buffer, as defined by
 *		&enum v4l2_field.
 * @timecode:	frame timecode.
 * @sequence:	sequence count of this frame.
 * @request_fd:	the request_fd associated with this buffer
 * @is_held:	if true, then this capture buffer was held
 * @planes:	plane information (userptr/fd, length, bytesused, data_offset).
 *
 * Should contain enough information to be able to cover all the fields
 * of &struct v4l2_buffer at ``videodev2.h``.
 */
struct vb2_v4l2_buffer {
	struct vb2_buffer	vb2_buf;

	__u32			flags;
	__u32			field;
	struct v4l2_timecode	timecode;
	__u32			sequence;
	__s32			request_fd;
	bool			is_held;
	struct vb2_plane	planes[VB2_MAX_PLANES];
};

/* VB2 V4L2 flags as set in vb2_queue.subsystem_flags */
#define VB2_V4L2_FL_SUPPORTS_M2M_HOLD_CAPTURE_BUF (1 << 0)

/*
 * to_vb2_v4l2_buffer() - cast struct vb2_buffer * to struct vb2_v4l2_buffer *
 */
#define to_vb2_v4l2_buffer(vb) \
	container_of(vb, struct vb2_v4l2_buffer, vb2_buf)

/**
 * vb2_find_buffer() - Find a buffer with given timestamp
 *
 * @q:		pointer to &struct vb2_queue with videobuf2 queue.
 * @timestamp:	the timestamp to find.
 *
 * Returns the buffer with the given @timestamp, or NULL if not found.
*/ struct vb2_buffer *vb2_find_buffer(struct vb2_queue *q, u64 timestamp); int vb2_querybuf(struct vb2_queue *q, struct v4l2_buffer *b); /** * vb2_reqbufs() - Wrapper for vb2_core_reqbufs() that also verifies * the memory and type values. * * @q: pointer to &struct vb2_queue with videobuf2 queue. * @req: &struct v4l2_requestbuffers passed from userspace to * &v4l2_ioctl_ops->vidioc_reqbufs handler in driver. */ int vb2_reqbufs(struct vb2_queue *q, struct v4l2_requestbuffers *req); /** * vb2_create_bufs() - Wrapper for vb2_core_create_bufs() that also verifies * the memory and type values. * * @q: pointer to &struct vb2_queue with videobuf2 queue. * @create: creation parameters, passed from userspace to * &v4l2_ioctl_ops->vidioc_create_bufs handler in driver */ int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create); /** * vb2_prepare_buf() - Pass ownership of a buffer from userspace to the kernel * * @q: pointer to &struct vb2_queue with videobuf2 queue. * @mdev: pointer to &struct media_device, may be NULL. * @b: buffer structure passed from userspace to * &v4l2_ioctl_ops->vidioc_prepare_buf handler in driver * * Should be called from &v4l2_ioctl_ops->vidioc_prepare_buf ioctl handler * of a driver. * * This function: * * #) verifies the passed buffer, * #) calls &vb2_ops->buf_prepare callback in the driver (if provided), * in which driver-specific buffer initialization can be performed. * #) if @b->request_fd is non-zero and @mdev->ops->req_queue is set, * then bind the prepared buffer to the request. * * The return values from this function are intended to be directly returned * from &v4l2_ioctl_ops->vidioc_prepare_buf handler in driver. */ int vb2_prepare_buf(struct vb2_queue *q, struct media_device *mdev, struct v4l2_buffer *b); /** * vb2_qbuf() - Queue a buffer from userspace * @q: pointer to &struct vb2_queue with videobuf2 queue. * @mdev: pointer to &struct media_device, may be NULL. * @b: buffer structure passed from userspace to * &v4l2_ioctl_ops->vidioc_qbuf handler in driver * * Should be called from &v4l2_ioctl_ops->vidioc_qbuf handler of a driver. * * This function: * * #) verifies the passed buffer; * #) if @b->request_fd is non-zero and @mdev->ops->req_queue is set, * then bind the buffer to the request. * #) if necessary, calls &vb2_ops->buf_prepare callback in the driver * (if provided), in which driver-specific buffer initialization can * be performed; * #) if streaming is on, queues the buffer in driver by the means of * &vb2_ops->buf_queue callback for processing. * * The return values from this function are intended to be directly returned * from &v4l2_ioctl_ops->vidioc_qbuf handler in driver. */ int vb2_qbuf(struct vb2_queue *q, struct media_device *mdev, struct v4l2_buffer *b); /** * vb2_expbuf() - Export a buffer as a file descriptor * @q: pointer to &struct vb2_queue with videobuf2 queue. * @eb: export buffer structure passed from userspace to * &v4l2_ioctl_ops->vidioc_expbuf handler in driver * * The return values from this function are intended to be directly returned * from &v4l2_ioctl_ops->vidioc_expbuf handler in driver. */ int vb2_expbuf(struct vb2_queue *q, struct v4l2_exportbuffer *eb); /** * vb2_dqbuf() - Dequeue a buffer to the userspace * @q: pointer to &struct vb2_queue with videobuf2 queue. * @b: buffer structure passed from userspace to * &v4l2_ioctl_ops->vidioc_dqbuf handler in driver * @nonblocking: if true, this call will not sleep waiting for a buffer if no * buffers ready for dequeuing are present. 
 *		Normally the driver would be passing
 *		(&file->f_flags & %O_NONBLOCK) here
 *
 * Should be called from &v4l2_ioctl_ops->vidioc_dqbuf ioctl handler
 * of a driver.
 *
 * This function:
 *
 * #) verifies the passed buffer;
 * #) calls &vb2_ops->buf_finish callback in the driver (if provided), in which
 *    driver can perform any additional operations that may be required before
 *    returning the buffer to userspace, such as cache sync;
 * #) the buffer struct members are filled with relevant information for
 *    the userspace.
 *
 * The return values from this function are intended to be directly returned
 * from &v4l2_ioctl_ops->vidioc_dqbuf handler in driver.
 */
int vb2_dqbuf(struct vb2_queue *q, struct v4l2_buffer *b, bool nonblocking);

/**
 * vb2_streamon - start streaming
 * @q:		pointer to &struct vb2_queue with videobuf2 queue.
 * @type:	type argument passed from userspace to vidioc_streamon handler,
 *		as defined by &enum v4l2_buf_type.
 *
 * Should be called from &v4l2_ioctl_ops->vidioc_streamon handler of a driver.
 *
 * This function:
 *
 * 1) verifies current state
 * 2) passes any previously queued buffers to the driver and starts streaming
 *
 * The return values from this function are intended to be directly returned
 * from &v4l2_ioctl_ops->vidioc_streamon handler in the driver.
 */
int vb2_streamon(struct vb2_queue *q, enum v4l2_buf_type type);

/**
 * vb2_streamoff - stop streaming
 * @q:		pointer to &struct vb2_queue with videobuf2 queue.
 * @type:	type argument passed from userspace to vidioc_streamoff handler
 *
 * Should be called from vidioc_streamoff handler of a driver.
 *
 * This function:
 *
 * #) verifies current state,
 * #) stops streaming and dequeues any queued buffers, including those
 *    previously passed to the driver (after waiting for the driver to finish).
 *
 * This call can be used for pausing playback.
 * The return values from this function are intended to be directly returned
 * from vidioc_streamoff handler in the driver.
 */
int vb2_streamoff(struct vb2_queue *q, enum v4l2_buf_type type);

/**
 * vb2_queue_init() - initialize a videobuf2 queue
 * @q:		pointer to &struct vb2_queue with videobuf2 queue.
 *
 * The vb2_queue structure should be allocated by the driver. The driver is
 * responsible for clearing its content and setting initial values for some
 * required entries before calling this function.
 * q->ops, q->mem_ops, q->type and q->io_modes are mandatory. Please refer
 * to the struct vb2_queue description in include/media/videobuf2-core.h
 * for more information.
 */
int __must_check vb2_queue_init(struct vb2_queue *q);

/**
 * vb2_queue_init_name() - initialize a videobuf2 queue with a name
 * @q:		pointer to &struct vb2_queue with videobuf2 queue.
 * @name:	the queue name
 *
 * This function initializes the vb2_queue exactly like vb2_queue_init(),
 * and additionally sets the queue name. The queue name is used for logging
 * purposes, and should uniquely identify the queue within the context of the
 * device it belongs to. This is useful to attribute kernel log messages to the
 * right queue for m2m devices or other devices that handle multiple queues.
 */
int __must_check vb2_queue_init_name(struct vb2_queue *q, const char *name);

/**
 * vb2_queue_release() - stop streaming, release the queue and free memory
 * @q:		pointer to &struct vb2_queue with videobuf2 queue.
 *
 * This function stops streaming and performs necessary clean ups, including
 * freeing video buffer memory. The driver is responsible for freeing
 * the vb2_queue structure itself.
*/ void vb2_queue_release(struct vb2_queue *q); /** * vb2_queue_change_type() - change the type of an inactive vb2_queue * @q: pointer to &struct vb2_queue with videobuf2 queue. * @type: the type to change to (V4L2_BUF_TYPE_VIDEO_*) * * This function changes the type of the vb2_queue. This is only possible * if the queue is not busy (i.e. no buffers have been allocated). * * vb2_queue_change_type() can be used to support multiple buffer types using * the same queue. The driver can implement v4l2_ioctl_ops.vidioc_reqbufs and * v4l2_ioctl_ops.vidioc_create_bufs functions and call vb2_queue_change_type() * before calling vb2_ioctl_reqbufs() or vb2_ioctl_create_bufs(), and thus * "lock" the buffer type until the buffers have been released. */ int vb2_queue_change_type(struct vb2_queue *q, unsigned int type); /** * vb2_poll() - implements poll userspace operation * @q: pointer to &struct vb2_queue with videobuf2 queue. * @file: file argument passed to the poll file operation handler * @wait: wait argument passed to the poll file operation handler * * This function implements poll file operation handler for a driver. * For CAPTURE queues, if a buffer is ready to be dequeued, the userspace will * be informed that the file descriptor of a video device is available for * reading. * For OUTPUT queues, if a buffer is ready to be dequeued, the file descriptor * will be reported as available for writing. * * If the driver uses struct v4l2_fh, then vb2_poll() will also check for any * pending events. * * The return values from this function are intended to be directly returned * from poll handler in driver. */ __poll_t vb2_poll(struct vb2_queue *q, struct file *file, poll_table *wait); /* * The following functions are not part of the vb2 core API, but are simple * helper functions that you can use in your struct v4l2_file_operations, * struct v4l2_ioctl_ops and struct vb2_ops. They will serialize if vb2_queue->lock * or video_device->lock is set, and they will set and test the queue owner * (vb2_queue->owner) to check if the calling filehandle is permitted to do the * queuing operation. */ /** * vb2_queue_is_busy() - check if the queue is busy * @q: pointer to &struct vb2_queue with videobuf2 queue. * @file: file through which the vb2 queue access is performed * * The queue is considered busy if it has an owner and the owner is not the * @file. * * Queue ownership is acquired and checked by some of the v4l2_ioctl_ops helpers * below. Drivers can also use this function directly when they need to * open-code ioctl handlers, for instance to add additional checks between the * queue ownership test and the call to the corresponding vb2 operation. 
*/ static inline bool vb2_queue_is_busy(struct vb2_queue *q, struct file *file) { return q->owner && q->owner != file->private_data; } /* struct v4l2_ioctl_ops helpers */ int vb2_ioctl_reqbufs(struct file *file, void *priv, struct v4l2_requestbuffers *p); int vb2_ioctl_create_bufs(struct file *file, void *priv, struct v4l2_create_buffers *p); int vb2_ioctl_prepare_buf(struct file *file, void *priv, struct v4l2_buffer *p); int vb2_ioctl_querybuf(struct file *file, void *priv, struct v4l2_buffer *p); int vb2_ioctl_qbuf(struct file *file, void *priv, struct v4l2_buffer *p); int vb2_ioctl_dqbuf(struct file *file, void *priv, struct v4l2_buffer *p); int vb2_ioctl_streamon(struct file *file, void *priv, enum v4l2_buf_type i); int vb2_ioctl_streamoff(struct file *file, void *priv, enum v4l2_buf_type i); int vb2_ioctl_expbuf(struct file *file, void *priv, struct v4l2_exportbuffer *p); int vb2_ioctl_remove_bufs(struct file *file, void *priv, struct v4l2_remove_buffers *p); /* struct v4l2_file_operations helpers */ int vb2_fop_mmap(struct file *file, struct vm_area_struct *vma); int vb2_fop_release(struct file *file); int _vb2_fop_release(struct file *file, struct mutex *lock); ssize_t vb2_fop_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos); ssize_t vb2_fop_read(struct file *file, char __user *buf, size_t count, loff_t *ppos); __poll_t vb2_fop_poll(struct file *file, poll_table *wait); #ifndef CONFIG_MMU unsigned long vb2_fop_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); #endif /** * vb2_video_unregister_device - unregister the video device and release queue * * @vdev: pointer to &struct video_device * * If the driver uses vb2_fop_release()/_vb2_fop_release(), then it should use * vb2_video_unregister_device() instead of video_unregister_device(). * * This function will call video_unregister_device() and then release the * vb2_queue if streaming is in progress. This will stop streaming and * this will simplify the unbind sequence since after this call all subdevs * will have stopped streaming as well. */ void vb2_video_unregister_device(struct video_device *vdev); /** * vb2_ops_wait_prepare - helper function to lock a struct &vb2_queue * * @vq: pointer to &struct vb2_queue * * .. note:: only use if vq->lock is non-NULL. */ void vb2_ops_wait_prepare(struct vb2_queue *vq); /** * vb2_ops_wait_finish - helper function to unlock a struct &vb2_queue * * @vq: pointer to &struct vb2_queue * * .. note:: only use if vq->lock is non-NULL. */ void vb2_ops_wait_finish(struct vb2_queue *vq); struct media_request; int vb2_request_validate(struct media_request *req); void vb2_request_queue(struct media_request *req); #endif /* _MEDIA_VIDEOBUF2_V4L2_H */
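The declarations above are designed to be dropped straight into a driver's ops tables. The following is only a sketch of one plausible wiring for a hypothetical capture driver: struct foo_dev, foo->lock and foo_queue_ops are assumptions made for the example, while the vb2 helpers, queue fields and memops come from the videobuf2/V4L2 headers.

/*
 * Hypothetical wiring of the vb2 helpers; the foo_* names are illustrative
 * only and do not correspond to an in-tree driver.
 */
static const struct v4l2_ioctl_ops foo_ioctl_ops = {
        /* format and enumeration ioctls omitted for brevity */
        .vidioc_reqbufs         = vb2_ioctl_reqbufs,
        .vidioc_create_bufs     = vb2_ioctl_create_bufs,
        .vidioc_querybuf        = vb2_ioctl_querybuf,
        .vidioc_prepare_buf     = vb2_ioctl_prepare_buf,
        .vidioc_qbuf            = vb2_ioctl_qbuf,
        .vidioc_dqbuf           = vb2_ioctl_dqbuf,
        .vidioc_expbuf          = vb2_ioctl_expbuf,
        .vidioc_streamon        = vb2_ioctl_streamon,
        .vidioc_streamoff       = vb2_ioctl_streamoff,
};

static const struct v4l2_file_operations foo_fops = {
        .owner          = THIS_MODULE,
        .open           = v4l2_fh_open,
        .release        = vb2_fop_release,
        .read           = vb2_fop_read,
        .poll           = vb2_fop_poll,
        .mmap           = vb2_fop_mmap,
        .unlocked_ioctl = video_ioctl2,
};

static int foo_init_queue(struct foo_dev *foo)
{
        struct vb2_queue *q = &foo->queue;

        q->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
        q->io_modes = VB2_MMAP | VB2_DMABUF | VB2_READ;
        q->drv_priv = foo;
        q->buf_struct_size = sizeof(struct vb2_v4l2_buffer);
        q->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
        q->ops = &foo_queue_ops;                /* driver's vb2_ops, not shown here */
        q->mem_ops = &vb2_vmalloc_memops;       /* from videobuf2-vmalloc.h */
        q->lock = &foo->lock;                   /* serializes the helpers above */
        return vb2_queue_init(q);
}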
| 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * fence-array: aggregates fence to be waited together * * Copyright (C) 2016 Collabora Ltd * Copyright (C) 2016 Advanced Micro Devices, Inc. * Authors: * Gustavo Padovan <gustavo@padovan.org> * Christian König <christian.koenig@amd.com> */ #ifndef __LINUX_DMA_FENCE_ARRAY_H #define __LINUX_DMA_FENCE_ARRAY_H #include <linux/dma-fence.h> #include <linux/irq_work.h> /** * struct dma_fence_array_cb - callback helper for fence array * @cb: fence callback structure for signaling * @array: reference to the parent fence array object */ struct dma_fence_array_cb { struct dma_fence_cb cb; struct dma_fence_array *array; }; /** * struct dma_fence_array - fence to represent an array of fences * @base: fence base class * @lock: spinlock for fence handling * @num_fences: number of fences in the array * @num_pending: fences in the array still pending * @fences: array of the fences * @work: internal irq_work function * @callbacks: array of callback helpers */ struct dma_fence_array { struct dma_fence base; spinlock_t lock; unsigned num_fences; atomic_t num_pending; struct dma_fence **fences; struct irq_work work; struct dma_fence_array_cb callbacks[] __counted_by(num_fences); }; /** * to_dma_fence_array - cast a fence to a dma_fence_array * @fence: fence to cast to a dma_fence_array * * Returns NULL if the fence is not a dma_fence_array, * or the dma_fence_array otherwise. */ static inline struct dma_fence_array * to_dma_fence_array(struct dma_fence *fence) { if (!fence || !dma_fence_is_array(fence)) return NULL; return container_of(fence, struct dma_fence_array, base); } /** * dma_fence_array_for_each - iterate over all fences in array * @fence: current fence * @index: index into the array * @head: potential dma_fence_array object * * Test if @array is a dma_fence_array object and if yes iterate over all fences * in the array. If not just iterate over the fence in @array itself. * * For a deep dive iterator see dma_fence_unwrap_for_each(). */ #define dma_fence_array_for_each(fence, index, head) \ for (index = 0, fence = dma_fence_array_first(head); fence; \ ++(index), fence = dma_fence_array_next(head, index)) struct dma_fence_array *dma_fence_array_alloc(int num_fences); void dma_fence_array_init(struct dma_fence_array *array, int num_fences, struct dma_fence **fences, u64 context, unsigned seqno, bool signal_on_any); struct dma_fence_array *dma_fence_array_create(int num_fences, struct dma_fence **fences, u64 context, unsigned seqno, bool signal_on_any); bool dma_fence_match_context(struct dma_fence *fence, u64 context); struct dma_fence *dma_fence_array_first(struct dma_fence *head); struct dma_fence *dma_fence_array_next(struct dma_fence *head, unsigned int index); #endif /* __LINUX_DMA_FENCE_ARRAY_H */ |
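As a usage illustration, a caller can aggregate a couple of fences with dma_fence_array_create() and later walk the result (or any plain fence) with dma_fence_array_for_each(). This is only a sketch: foo_merge_fences() and foo_dump_fences() are made-up names, the input fences are assumed to be valid references held by the caller, and error handling is kept minimal.

/* Illustrative only: aggregate two fences and wait for both to signal. */
static struct dma_fence *foo_merge_fences(struct dma_fence *f1, struct dma_fence *f2)
{
        struct dma_fence_array *array;
        struct dma_fence **fences;

        fences = kmalloc_array(2, sizeof(*fences), GFP_KERNEL);
        if (!fences)
                return NULL;

        /* The array takes ownership of the references and of the allocation. */
        fences[0] = dma_fence_get(f1);
        fences[1] = dma_fence_get(f2);

        array = dma_fence_array_create(2, fences, dma_fence_context_alloc(1),
                                       1, false /* signal only when all signal */);
        if (!array) {
                dma_fence_put(fences[0]);
                dma_fence_put(fences[1]);
                kfree(fences);
                return NULL;
        }
        return &array->base;
}

/* A consumer can then walk the container (or a plain fence) uniformly. */
static void foo_dump_fences(struct dma_fence *merged)
{
        struct dma_fence *fence;
        unsigned int index;

        dma_fence_array_for_each(fence, index, merged)
                pr_debug("fence %u: context %llu seqno %llu\n",
                         index, fence->context, fence->seqno);
}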
| 2 3 3 2 2 3 1 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 | // SPDX-License-Identifier: GPL-2.0-only // // ethtool interface for Ethernet PSE (Power Sourcing Equipment) // and PD (Powered Device) // // Copyright (c) 2022 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> // #include "common.h" #include "linux/pse-pd/pse.h" #include "netlink.h" #include <linux/ethtool_netlink.h> #include <linux/ethtool.h> #include <linux/export.h> #include <linux/phy.h> struct pse_req_info { struct ethnl_req_info base; }; struct pse_reply_data { struct ethnl_reply_data base; struct ethtool_pse_control_status status; }; #define PSE_REPDATA(__reply_base) \ container_of(__reply_base, struct pse_reply_data, base) /* PSE_GET */ const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1] = { [ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy), }; static int pse_get_pse_attributes(struct phy_device *phydev, struct netlink_ext_ack *extack, struct pse_reply_data *data) { if (!phydev) { NL_SET_ERR_MSG(extack, "No PHY found"); return -EOPNOTSUPP; } if (!phydev->psec) { NL_SET_ERR_MSG(extack, "No PSE is attached"); return -EOPNOTSUPP; } memset(&data->status, 0, sizeof(data->status)); return pse_ethtool_get_status(phydev->psec, extack, &data->status); } static int pse_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct pse_reply_data *data = PSE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; struct nlattr **tb = info->attrs; struct phy_device *phydev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PSE_HEADER, info->extack); if (IS_ERR(phydev)) return -ENODEV; ret = pse_get_pse_attributes(phydev, info->extack, data); ethnl_ops_complete(dev); return ret; } static int pse_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct pse_reply_data *data = PSE_REPDATA(reply_base); const struct ethtool_pse_control_status *st = &data->status; int len = 0; if (st->pw_d_id) len += nla_total_size(sizeof(u32)); /* _PSE_PW_D_ID */ if 
(st->podl_admin_state > 0) len += nla_total_size(sizeof(u32)); /* _PODL_PSE_ADMIN_STATE */ if (st->podl_pw_status > 0) len += nla_total_size(sizeof(u32)); /* _PODL_PSE_PW_D_STATUS */ if (st->c33_admin_state > 0) len += nla_total_size(sizeof(u32)); /* _C33_PSE_ADMIN_STATE */ if (st->c33_pw_status > 0) len += nla_total_size(sizeof(u32)); /* _C33_PSE_PW_D_STATUS */ if (st->c33_pw_class > 0) len += nla_total_size(sizeof(u32)); /* _C33_PSE_PW_CLASS */ if (st->c33_actual_pw > 0) len += nla_total_size(sizeof(u32)); /* _C33_PSE_ACTUAL_PW */ if (st->c33_ext_state_info.c33_pse_ext_state > 0) { len += nla_total_size(sizeof(u32)); /* _C33_PSE_EXT_STATE */ if (st->c33_ext_state_info.__c33_pse_ext_substate > 0) /* _C33_PSE_EXT_SUBSTATE */ len += nla_total_size(sizeof(u32)); } if (st->c33_avail_pw_limit > 0) /* _C33_AVAIL_PSE_PW_LIMIT */ len += nla_total_size(sizeof(u32)); if (st->c33_pw_limit_nb_ranges > 0) /* _C33_PSE_PW_LIMIT_RANGES */ len += st->c33_pw_limit_nb_ranges * (nla_total_size(0) + nla_total_size(sizeof(u32)) * 2); if (st->prio_max) /* _PSE_PRIO_MAX + _PSE_PRIO */ len += nla_total_size(sizeof(u32)) * 2; return len; } static int pse_put_pw_limit_ranges(struct sk_buff *skb, const struct ethtool_pse_control_status *st) { const struct ethtool_c33_pse_pw_limit_range *pw_limit_ranges; int i; pw_limit_ranges = st->c33_pw_limit_ranges; for (i = 0; i < st->c33_pw_limit_nb_ranges; i++) { struct nlattr *nest; nest = nla_nest_start(skb, ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_C33_PSE_PW_LIMIT_MIN, pw_limit_ranges->min) || nla_put_u32(skb, ETHTOOL_A_C33_PSE_PW_LIMIT_MAX, pw_limit_ranges->max)) { nla_nest_cancel(skb, nest); return -EMSGSIZE; } nla_nest_end(skb, nest); pw_limit_ranges++; } return 0; } static int pse_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct pse_reply_data *data = PSE_REPDATA(reply_base); const struct ethtool_pse_control_status *st = &data->status; if (st->pw_d_id && nla_put_u32(skb, ETHTOOL_A_PSE_PW_D_ID, st->pw_d_id)) return -EMSGSIZE; if (st->podl_admin_state > 0 && nla_put_u32(skb, ETHTOOL_A_PODL_PSE_ADMIN_STATE, st->podl_admin_state)) return -EMSGSIZE; if (st->podl_pw_status > 0 && nla_put_u32(skb, ETHTOOL_A_PODL_PSE_PW_D_STATUS, st->podl_pw_status)) return -EMSGSIZE; if (st->c33_admin_state > 0 && nla_put_u32(skb, ETHTOOL_A_C33_PSE_ADMIN_STATE, st->c33_admin_state)) return -EMSGSIZE; if (st->c33_pw_status > 0 && nla_put_u32(skb, ETHTOOL_A_C33_PSE_PW_D_STATUS, st->c33_pw_status)) return -EMSGSIZE; if (st->c33_pw_class > 0 && nla_put_u32(skb, ETHTOOL_A_C33_PSE_PW_CLASS, st->c33_pw_class)) return -EMSGSIZE; if (st->c33_actual_pw > 0 && nla_put_u32(skb, ETHTOOL_A_C33_PSE_ACTUAL_PW, st->c33_actual_pw)) return -EMSGSIZE; if (st->c33_ext_state_info.c33_pse_ext_state > 0) { if (nla_put_u32(skb, ETHTOOL_A_C33_PSE_EXT_STATE, st->c33_ext_state_info.c33_pse_ext_state)) return -EMSGSIZE; if (st->c33_ext_state_info.__c33_pse_ext_substate > 0 && nla_put_u32(skb, ETHTOOL_A_C33_PSE_EXT_SUBSTATE, st->c33_ext_state_info.__c33_pse_ext_substate)) return -EMSGSIZE; } if (st->c33_avail_pw_limit > 0 && nla_put_u32(skb, ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT, st->c33_avail_pw_limit)) return -EMSGSIZE; if (st->c33_pw_limit_nb_ranges > 0 && pse_put_pw_limit_ranges(skb, st)) return -EMSGSIZE; if (st->prio_max && (nla_put_u32(skb, ETHTOOL_A_PSE_PRIO_MAX, st->prio_max) || nla_put_u32(skb, ETHTOOL_A_PSE_PRIO, st->prio))) return -EMSGSIZE; return 0; } static void 
pse_cleanup_data(struct ethnl_reply_data *reply_base) { const struct pse_reply_data *data = PSE_REPDATA(reply_base); kfree(data->status.c33_pw_limit_ranges); } /* PSE_SET */ const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = { [ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy), [ETHTOOL_A_PODL_PSE_ADMIN_CONTROL] = NLA_POLICY_RANGE(NLA_U32, ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED, ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED), [ETHTOOL_A_C33_PSE_ADMIN_CONTROL] = NLA_POLICY_RANGE(NLA_U32, ETHTOOL_C33_PSE_ADMIN_STATE_DISABLED, ETHTOOL_C33_PSE_ADMIN_STATE_ENABLED), [ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT] = { .type = NLA_U32 }, [ETHTOOL_A_PSE_PRIO] = { .type = NLA_U32 }, }; static int ethnl_set_pse_validate(struct phy_device *phydev, struct genl_info *info) { struct nlattr **tb = info->attrs; if (IS_ERR_OR_NULL(phydev)) { NL_SET_ERR_MSG(info->extack, "No PHY is attached"); return -EOPNOTSUPP; } if (!phydev->psec) { NL_SET_ERR_MSG(info->extack, "No PSE is attached"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL] && !pse_has_podl(phydev->psec)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL], "setting PoDL PSE admin control not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL] && !pse_has_c33(phydev->psec)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL], "setting C33 PSE admin control not supported"); return -EOPNOTSUPP; } return 0; } static int ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info) { struct nlattr **tb = info->attrs; struct phy_device *phydev; int ret; phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PSE_HEADER, info->extack); ret = ethnl_set_pse_validate(phydev, info); if (ret) return ret; if (tb[ETHTOOL_A_PSE_PRIO]) { unsigned int prio; prio = nla_get_u32(tb[ETHTOOL_A_PSE_PRIO]); ret = pse_ethtool_set_prio(phydev->psec, info->extack, prio); if (ret) return ret; } if (tb[ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT]) { unsigned int pw_limit; pw_limit = nla_get_u32(tb[ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT]); ret = pse_ethtool_set_pw_limit(phydev->psec, info->extack, pw_limit); if (ret) return ret; } /* These values are already validated by the ethnl_pse_set_policy */ if (tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL] || tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL]) { struct pse_control_config config = {}; if (tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]) config.podl_admin_control = nla_get_u32(tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]); if (tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL]) config.c33_admin_control = nla_get_u32(tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL]); /* pse_ethtool_set_config() will do nothing if the config * is zero */ ret = pse_ethtool_set_config(phydev->psec, info->extack, &config); if (ret) return ret; } /* Return errno or zero - PSE has no notification */ return ret; } const struct ethnl_request_ops ethnl_pse_request_ops = { .request_cmd = ETHTOOL_MSG_PSE_GET, .reply_cmd = ETHTOOL_MSG_PSE_GET_REPLY, .hdr_attr = ETHTOOL_A_PSE_HEADER, .req_info_size = sizeof(struct pse_req_info), .reply_data_size = sizeof(struct pse_reply_data), .prepare_data = pse_prepare_data, .reply_size = pse_reply_size, .fill_reply = pse_fill_reply, .cleanup_data = pse_cleanup_data, .set = ethnl_set_pse, /* PSE has no notification */ }; void ethnl_pse_send_ntf(struct net_device *netdev, unsigned long notifs) { void *reply_payload; struct sk_buff *skb; int reply_len; int ret; ASSERT_RTNL(); if (!netdev || !notifs) return; reply_len = ethnl_reply_header_size() + nla_total_size(sizeof(u32)); /* _PSE_NTF_EVENTS */ skb = 
genlmsg_new(reply_len, GFP_KERNEL); if (!skb) return; reply_payload = ethnl_bcastmsg_put(skb, ETHTOOL_MSG_PSE_NTF); if (!reply_payload) goto err_skb; ret = ethnl_fill_reply_header(skb, netdev, ETHTOOL_A_PSE_NTF_HEADER); if (ret < 0) goto err_skb; if (nla_put_uint(skb, ETHTOOL_A_PSE_NTF_EVENTS, notifs)) goto err_skb; genlmsg_end(skb, reply_payload); ethnl_multicast(skb, netdev); return; err_skb: nlmsg_free(skb); } EXPORT_SYMBOL_GPL(ethnl_pse_send_ntf);
| 6 6 232 230 232 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 | /* * Copyright (C) 2016 Red Hat * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. * * Authors: * Rob Clark <robdclark@gmail.com> */ #include <linux/debugfs.h> #include <linux/dynamic_debug.h> #include <linux/export.h> #include <linux/io.h> #include <linux/moduleparam.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/stdarg.h> #include <drm/drm.h> #include <drm/drm_drv.h> #include <drm/drm_print.h> /* * __drm_debug: Enable debug output. * Bitmask of DRM_UT_x. See include/drm/drm_print.h for details. 
*/ unsigned long __drm_debug; EXPORT_SYMBOL(__drm_debug); MODULE_PARM_DESC(debug, "Enable debug output, where each bit enables a debug category.\n" "\t\tBit 0 (0x01) will enable CORE messages (drm core code)\n" "\t\tBit 1 (0x02) will enable DRIVER messages (drm controller code)\n" "\t\tBit 2 (0x04) will enable KMS messages (modesetting code)\n" "\t\tBit 3 (0x08) will enable PRIME messages (prime code)\n" "\t\tBit 4 (0x10) will enable ATOMIC messages (atomic code)\n" "\t\tBit 5 (0x20) will enable VBL messages (vblank code)\n" "\t\tBit 7 (0x80) will enable LEASE messages (leasing code)\n" "\t\tBit 8 (0x100) will enable DP messages (displayport code)"); #if !defined(CONFIG_DRM_USE_DYNAMIC_DEBUG) module_param_named(debug, __drm_debug, ulong, 0600); #else /* classnames must match vals of enum drm_debug_category */ DECLARE_DYNDBG_CLASSMAP(drm_debug_classes, DD_CLASS_TYPE_DISJOINT_BITS, 0, "DRM_UT_CORE", "DRM_UT_DRIVER", "DRM_UT_KMS", "DRM_UT_PRIME", "DRM_UT_ATOMIC", "DRM_UT_VBL", "DRM_UT_STATE", "DRM_UT_LEASE", "DRM_UT_DP", "DRM_UT_DRMRES"); static struct ddebug_class_param drm_debug_bitmap = { .bits = &__drm_debug, .flags = "p", .map = &drm_debug_classes, }; module_param_cb(debug, ¶m_ops_dyndbg_classes, &drm_debug_bitmap, 0600); #endif void __drm_puts_coredump(struct drm_printer *p, const char *str) { struct drm_print_iterator *iterator = p->arg; ssize_t len; if (!iterator->remain) return; if (iterator->offset < iterator->start) { ssize_t copy; len = strlen(str); if (iterator->offset + len <= iterator->start) { iterator->offset += len; return; } copy = len - (iterator->start - iterator->offset); if (copy > iterator->remain) copy = iterator->remain; /* Copy out the bit of the string that we need */ if (iterator->data) memcpy(iterator->data, str + (iterator->start - iterator->offset), copy); iterator->offset = iterator->start + copy; iterator->remain -= copy; } else { ssize_t pos = iterator->offset - iterator->start; len = min_t(ssize_t, strlen(str), iterator->remain); if (iterator->data) memcpy(iterator->data + pos, str, len); iterator->offset += len; iterator->remain -= len; } } EXPORT_SYMBOL(__drm_puts_coredump); void __drm_printfn_coredump(struct drm_printer *p, struct va_format *vaf) { struct drm_print_iterator *iterator = p->arg; size_t len; char *buf; if (!iterator->remain) return; /* Figure out how big the string will be */ len = snprintf(NULL, 0, "%pV", vaf); /* This is the easiest path, we've already advanced beyond the offset */ if (iterator->offset + len <= iterator->start) { iterator->offset += len; return; } /* Then check if we can directly copy into the target buffer */ if ((iterator->offset >= iterator->start) && (len < iterator->remain)) { ssize_t pos = iterator->offset - iterator->start; if (iterator->data) snprintf(((char *) iterator->data) + pos, iterator->remain, "%pV", vaf); iterator->offset += len; iterator->remain -= len; return; } /* * Finally, hit the slow path and make a temporary string to copy over * using _drm_puts_coredump */ buf = kmalloc(len + 1, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); if (!buf) return; snprintf(buf, len + 1, "%pV", vaf); __drm_puts_coredump(p, (const char *) buf); kfree(buf); } EXPORT_SYMBOL(__drm_printfn_coredump); void __drm_puts_seq_file(struct drm_printer *p, const char *str) { seq_puts(p->arg, str); } EXPORT_SYMBOL(__drm_puts_seq_file); void __drm_printfn_seq_file(struct drm_printer *p, struct va_format *vaf) { seq_printf(p->arg, "%pV", vaf); } EXPORT_SYMBOL(__drm_printfn_seq_file); static void __drm_dev_vprintk(const struct device *dev, 
const char *level, const void *origin, const char *prefix, struct va_format *vaf) { const char *prefix_pad = prefix ? " " : ""; if (!prefix) prefix = ""; if (dev) { if (origin) dev_printk(level, dev, "[" DRM_NAME ":%ps]%s%s %pV", origin, prefix_pad, prefix, vaf); else dev_printk(level, dev, "[" DRM_NAME "]%s%s %pV", prefix_pad, prefix, vaf); } else { if (origin) printk("%s" "[" DRM_NAME ":%ps]%s%s %pV", level, origin, prefix_pad, prefix, vaf); else printk("%s" "[" DRM_NAME "]%s%s %pV", level, prefix_pad, prefix, vaf); } } void __drm_printfn_info(struct drm_printer *p, struct va_format *vaf) { dev_info(p->arg, "[" DRM_NAME "] %pV", vaf); } EXPORT_SYMBOL(__drm_printfn_info); void __drm_printfn_dbg(struct drm_printer *p, struct va_format *vaf) { const struct drm_device *drm = p->arg; const struct device *dev = drm ? drm->dev : NULL; enum drm_debug_category category = p->category; if (!__drm_debug_enabled(category)) return; __drm_dev_vprintk(dev, KERN_DEBUG, p->origin, p->prefix, vaf); } EXPORT_SYMBOL(__drm_printfn_dbg); void __drm_printfn_err(struct drm_printer *p, struct va_format *vaf) { struct drm_device *drm = p->arg; if (p->prefix) drm_err(drm, "%s %pV", p->prefix, vaf); else drm_err(drm, "%pV", vaf); } EXPORT_SYMBOL(__drm_printfn_err); void __drm_printfn_line(struct drm_printer *p, struct va_format *vaf) { unsigned int counter = ++p->line.counter; const char *prefix = p->prefix ?: ""; const char *pad = p->prefix ? " " : ""; if (p->line.series) drm_printf(p->arg, "%s%s%u.%u: %pV", prefix, pad, p->line.series, counter, vaf); else drm_printf(p->arg, "%s%s%u: %pV", prefix, pad, counter, vaf); } EXPORT_SYMBOL(__drm_printfn_line); /** * drm_puts - print a const string to a &drm_printer stream * @p: the &drm printer * @str: const string * * Allow &drm_printer types that have a constant string * option to use it. */ void drm_puts(struct drm_printer *p, const char *str) { if (p->puts) p->puts(p, str); else drm_printf(p, "%s", str); } EXPORT_SYMBOL(drm_puts); /** * drm_printf - print to a &drm_printer stream * @p: the &drm_printer * @f: format string */ void drm_printf(struct drm_printer *p, const char *f, ...) { va_list args; va_start(args, f); drm_vprintf(p, f, &args); va_end(args); } EXPORT_SYMBOL(drm_printf); /** * drm_print_bits - print bits to a &drm_printer stream * * Print bits (in flag fields for example) in human readable form. * * @p: the &drm_printer * @value: field value. * @bits: Array with bit names. * @nbits: Size of bit names array. */ void drm_print_bits(struct drm_printer *p, unsigned long value, const char * const bits[], unsigned int nbits) { bool first = true; unsigned int i; if (WARN_ON_ONCE(nbits > BITS_PER_TYPE(value))) nbits = BITS_PER_TYPE(value); for_each_set_bit(i, &value, nbits) { if (WARN_ON_ONCE(!bits[i])) continue; drm_printf(p, "%s%s", first ? "" : ",", bits[i]); first = false; } if (first) drm_printf(p, "(none)"); } EXPORT_SYMBOL(drm_print_bits); void drm_dev_printk(const struct device *dev, const char *level, const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; __drm_dev_vprintk(dev, level, __builtin_return_address(0), NULL, &vaf); va_end(args); } EXPORT_SYMBOL(drm_dev_printk); void __drm_dev_dbg(struct _ddebug *desc, const struct device *dev, enum drm_debug_category category, const char *format, ...) 
{ struct va_format vaf; va_list args; if (!__drm_debug_enabled(category)) return; /* we know we are printing for either syslog, tracefs, or both */ va_start(args, format); vaf.fmt = format; vaf.va = &args; __drm_dev_vprintk(dev, KERN_DEBUG, __builtin_return_address(0), NULL, &vaf); va_end(args); } EXPORT_SYMBOL(__drm_dev_dbg); void __drm_err(const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; __drm_dev_vprintk(NULL, KERN_ERR, __builtin_return_address(0), "*ERROR*", &vaf); va_end(args); } EXPORT_SYMBOL(__drm_err); /** * drm_print_regset32 - print the contents of registers to a * &drm_printer stream. * * @p: the &drm printer * @regset: the list of registers to print. * * Often in driver debug, it's useful to be able to either capture the * contents of registers in the steady state using debugfs or at * specific points during operation. This lets the driver have a * single list of registers for both. */ void drm_print_regset32(struct drm_printer *p, struct debugfs_regset32 *regset) { int namelen = 0; int i; for (i = 0; i < regset->nregs; i++) namelen = max(namelen, (int)strlen(regset->regs[i].name)); for (i = 0; i < regset->nregs; i++) { drm_printf(p, "%*s = 0x%08x\n", namelen, regset->regs[i].name, readl(regset->base + regset->regs[i].offset)); } } EXPORT_SYMBOL(drm_print_regset32); /** * drm_print_hex_dump - print a hex dump to a &drm_printer stream * @p: The &drm_printer * @prefix: Prefix for each line, may be NULL for no prefix * @buf: Buffer to dump * @len: Length of buffer * * Print hex dump to &drm_printer, with 16 space-separated hex bytes per line, * optionally with a prefix on each line. No separator is added after prefix. */ void drm_print_hex_dump(struct drm_printer *p, const char *prefix, const u8 *buf, size_t len) { int i; for (i = 0; i < len; i += 16) { int bytes_per_line = min(16, len - i); drm_printf(p, "%s%*ph\n", prefix ?: "", bytes_per_line, buf + i); } } EXPORT_SYMBOL(drm_print_hex_dump); |
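To show how the printer abstraction above is normally consumed, here is a hedged sketch of a driver routine that prints the same decoded status either into debugfs or into the kernel log. struct foo_device, its mmio mapping and the register layout are invented for the example; drm_seq_file_printer() and drm_info_printer() are the constructors declared in include/drm/drm_print.h.

/* Illustrative consumer of struct drm_printer: one dump routine, two sinks.
 * foo_device, foo_dump_status() and the bit names are hypothetical. */
static const char * const foo_status_bits[] = {
        "busy", "irq-pending", "reset",
};

static void foo_dump_status(struct foo_device *foo, struct drm_printer *p)
{
        u32 status = readl(foo->mmio + 0x10);   /* made-up register offset */

        drm_printf(p, "status: 0x%08x\n", status);
        drm_print_bits(p, status, foo_status_bits, ARRAY_SIZE(foo_status_bits));
        drm_printf(p, "\n");
}

/* debugfs show callback: print into the seq_file */
static int foo_status_show(struct seq_file *m, void *unused)
{
        struct foo_device *foo = m->private;
        struct drm_printer p = drm_seq_file_printer(m);

        foo_dump_status(foo, &p);
        return 0;
}

/* elsewhere, the same routine can target the kernel log */
static void foo_log_status(struct foo_device *foo)
{
        struct drm_printer p = drm_info_printer(foo->drm.dev);

        foo_dump_status(foo, &p);
}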
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_UTS_NAMESPACE_H #define _LINUX_UTS_NAMESPACE_H #include <linux/ns_common.h> #include <uapi/linux/utsname.h> struct user_namespace; extern struct user_namespace init_user_ns; struct uts_namespace { struct new_utsname name; struct user_namespace *user_ns; struct ucounts *ucounts; struct ns_common ns; } __randomize_layout; extern struct uts_namespace init_uts_ns; #ifdef CONFIG_UTS_NS static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) { return container_of(ns, struct uts_namespace, ns); } static inline void get_uts_ns(struct uts_namespace *ns) { ns_ref_inc(ns); } extern struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns); extern void free_uts_ns(struct uts_namespace *ns); static inline void put_uts_ns(struct uts_namespace *ns) { if (ns_ref_put(ns)) free_uts_ns(ns); } void uts_ns_init(void); #else static inline void get_uts_ns(struct uts_namespace *ns) { } static inline void put_uts_ns(struct uts_namespace *ns) { } static inline struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns) { if (flags & CLONE_NEWUTS) return ERR_PTR(-EINVAL); return old_ns; } static inline void uts_ns_init(void) { } #endif #endif /* _LINUX_UTS_NAMESPACE_H */
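A minimal sketch of the reference-counting pattern these helpers implement, assuming a hypothetical consumer that only needs to read the node name; the function name is made up, and the uts_sem locking shown is indicative of how readers usually serialize against sethostname() rather than prescriptive.

/* Illustrative refcounting pattern; foo_report_hostname() is hypothetical. */
static void foo_report_hostname(struct uts_namespace *ns)
{
        get_uts_ns(ns);                 /* pin the namespace while we use it */
        down_read(&uts_sem);            /* ->name may change via sethostname() */
        pr_info("nodename: %s\n", ns->name.nodename);
        up_read(&uts_sem);
        put_uts_ns(ns);                 /* frees via free_uts_ns() on the last reference */
}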
// SPDX-License-Identifier: GPL-2.0+ #include <drm/drm_atomic_helper.h> #include <drm/drm_edid.h> #include <drm/drm_managed.h> #include <drm/drm_probe_helper.h> #include "vkms_connector.h" static const struct drm_connector_funcs vkms_connector_funcs = { .fill_modes = drm_helper_probe_single_connector_modes, .reset = drm_atomic_helper_connector_reset, .atomic_duplicate_state = drm_atomic_helper_connector_duplicate_state, .atomic_destroy_state = drm_atomic_helper_connector_destroy_state, }; static int vkms_conn_get_modes(struct drm_connector *connector) { int count; /* Use the default modes list from DRM */ count = drm_add_modes_noedid(connector, XRES_MAX, YRES_MAX); drm_set_preferred_mode(connector, XRES_DEF, YRES_DEF); return count; } static struct drm_encoder *vkms_conn_best_encoder(struct drm_connector *connector) { struct drm_encoder *encoder; drm_connector_for_each_possible_encoder(connector, encoder) return encoder; return NULL; } static const struct drm_connector_helper_funcs vkms_conn_helper_funcs = { .get_modes = vkms_conn_get_modes, .best_encoder = vkms_conn_best_encoder, }; struct vkms_connector *vkms_connector_init(struct vkms_device *vkmsdev) { struct drm_device *dev = &vkmsdev->drm; struct vkms_connector *connector; int ret; connector = drmm_kzalloc(dev, sizeof(*connector), GFP_KERNEL); if (!connector) return ERR_PTR(-ENOMEM); ret = drmm_connector_init(dev, &connector->base, &vkms_connector_funcs, DRM_MODE_CONNECTOR_VIRTUAL, NULL); if (ret) return ERR_PTR(ret); drm_connector_helper_add(&connector->base, &vkms_conn_helper_funcs); return connector; }
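For context, the caller is expected to check the ERR_PTR-style return of vkms_connector_init() and then attach the single virtual encoder to the connector. The sketch below assumes a vkms_output_init()-like call site; the surrounding function and parameter names are assumptions, only vkms_connector_init() and drm_connector_attach_encoder() come from the code above and the DRM core.

/* Illustrative caller; the wiring shown here is not part of the file above. */
static int foo_wire_connector(struct vkms_device *vkmsdev, struct drm_encoder *encoder)
{
        struct vkms_connector *connector;

        connector = vkms_connector_init(vkmsdev);
        if (IS_ERR(connector))
                return PTR_ERR(connector);

        /* Expose the single virtual encoder through this connector. */
        return drm_connector_attach_encoder(&connector->base, encoder);
}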
// SPDX-License-Identifier: GPL-2.0-or-later /* * NET4: Sysctl interface to net af_unix subsystem. * * Authors: Mike Shaver. */ #include <linux/slab.h> #include <linux/string.h> #include <linux/sysctl.h> #include <net/af_unix.h> #include <net/net_namespace.h> #include "af_unix.h" static struct ctl_table unix_table[] = { { .procname = "max_dgram_qlen", .data = &init_net.unx.sysctl_max_dgram_qlen, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, }; int __net_init unix_sysctl_register(struct net *net) { struct ctl_table *table; if (net_eq(net, &init_net)) { table = unix_table; } else { table = kmemdup(unix_table, sizeof(unix_table), GFP_KERNEL); if (!table) goto err_alloc; table[0].data = &net->unx.sysctl_max_dgram_qlen; } net->unx.ctl = register_net_sysctl_sz(net, "net/unix", table, ARRAY_SIZE(unix_table)); if (net->unx.ctl == NULL) goto err_reg; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } void unix_sysctl_unregister(struct net *net) { const struct ctl_table *table; table = net->unx.ctl->ctl_table_arg; unregister_net_sysctl_table(net->unx.ctl); if (!net_eq(net, &init_net)) kfree(table); }
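These two helpers are normally driven from the per-network-namespace lifecycle. The sketch below shows the usual pernet_operations shape under made-up foo_* names; the real hookup lives in af_unix.c and may differ in detail.

/* Illustrative per-netns wiring (the in-tree hookup is in af_unix.c). */
static __net_init int foo_unix_net_init(struct net *net)
{
        net->unx.sysctl_max_dgram_qlen = 10;    /* default datagram queue length */
        return unix_sysctl_register(net);
}

static __net_exit void foo_unix_net_exit(struct net *net)
{
        unix_sysctl_unregister(net);
}

static struct pernet_operations foo_unix_net_ops = {
        .init = foo_unix_net_init,
        .exit = foo_unix_net_exit,
};

/* registered once at init time, e.g. via register_pernet_subsys(&foo_unix_net_ops) */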
| 61 63 57 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 | /* * Copyright (C) 2011-2013 Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef DRM_RECT_H #define DRM_RECT_H #include <linux/types.h> /** * DOC: rect utils * * Utility functions to help manage rectangular areas for * clipping, scaling, etc. calculations. */ /** * struct drm_rect - two dimensional rectangle * @x1: horizontal starting coordinate (inclusive) * @x2: horizontal ending coordinate (exclusive) * @y1: vertical starting coordinate (inclusive) * @y2: vertical ending coordinate (exclusive) * * Note that this must match the layout of struct drm_mode_rect or the damage * helpers like drm_atomic_helper_damage_iter_init() break. */ struct drm_rect { int x1, y1, x2, y2; }; /** * DRM_RECT_INIT - initialize a rectangle from x/y/w/h * @x: x coordinate * @y: y coordinate * @w: width * @h: height * * RETURNS: * A new rectangle of the specified size. 
*/ #define DRM_RECT_INIT(x, y, w, h) ((struct drm_rect){ \ .x1 = (x), \ .y1 = (y), \ .x2 = (x) + (w), \ .y2 = (y) + (h) }) /** * DRM_RECT_FMT - printf string for &struct drm_rect */ #define DRM_RECT_FMT "%dx%d%+d%+d" /** * DRM_RECT_ARG - printf arguments for &struct drm_rect * @r: rectangle struct */ #define DRM_RECT_ARG(r) drm_rect_width(r), drm_rect_height(r), (r)->x1, (r)->y1 /** * DRM_RECT_FP_FMT - printf string for &struct drm_rect in 16.16 fixed point */ #define DRM_RECT_FP_FMT "%d.%06ux%d.%06u%+d.%06u%+d.%06u" /** * DRM_RECT_FP_ARG - printf arguments for &struct drm_rect in 16.16 fixed point * @r: rectangle struct * * This is useful for e.g. printing plane source rectangles, which are in 16.16 * fixed point. */ #define DRM_RECT_FP_ARG(r) \ drm_rect_width(r) >> 16, ((drm_rect_width(r) & 0xffff) * 15625) >> 10, \ drm_rect_height(r) >> 16, ((drm_rect_height(r) & 0xffff) * 15625) >> 10, \ (r)->x1 >> 16, (((r)->x1 & 0xffff) * 15625) >> 10, \ (r)->y1 >> 16, (((r)->y1 & 0xffff) * 15625) >> 10 /** * drm_rect_init - initialize the rectangle from x/y/w/h * @r: rectangle * @x: x coordinate * @y: y coordinate * @width: width * @height: height */ static inline void drm_rect_init(struct drm_rect *r, int x, int y, int width, int height) { r->x1 = x; r->y1 = y; r->x2 = x + width; r->y2 = y + height; } /** * drm_rect_adjust_size - adjust the size of the rectangle * @r: rectangle to be adjusted * @dw: horizontal adjustment * @dh: vertical adjustment * * Change the size of rectangle @r by @dw in the horizontal direction, * and by @dh in the vertical direction, while keeping the center * of @r stationary. * * Positive @dw and @dh increase the size, negative values decrease it. */ static inline void drm_rect_adjust_size(struct drm_rect *r, int dw, int dh) { r->x1 -= dw >> 1; r->y1 -= dh >> 1; r->x2 += (dw + 1) >> 1; r->y2 += (dh + 1) >> 1; } /** * drm_rect_translate - translate the rectangle * @r: rectangle to be translated * @dx: horizontal translation * @dy: vertical translation * * Move rectangle @r by @dx in the horizontal direction, * and by @dy in the vertical direction. */ static inline void drm_rect_translate(struct drm_rect *r, int dx, int dy) { r->x1 += dx; r->y1 += dy; r->x2 += dx; r->y2 += dy; } /** * drm_rect_translate_to - translate the rectangle to an absolute position * @r: rectangle to be translated * @x: horizontal position * @y: vertical position * * Move rectangle @r to @x in the horizontal direction, * and to @y in the vertical direction. */ static inline void drm_rect_translate_to(struct drm_rect *r, int x, int y) { drm_rect_translate(r, x - r->x1, y - r->y1); } /** * drm_rect_downscale - downscale a rectangle * @r: rectangle to be downscaled * @horz: horizontal downscale factor * @vert: vertical downscale factor * * Divide the coordinates of rectangle @r by @horz and @vert. */ static inline void drm_rect_downscale(struct drm_rect *r, int horz, int vert) { r->x1 /= horz; r->y1 /= vert; r->x2 /= horz; r->y2 /= vert; } /** * drm_rect_width - determine the rectangle width * @r: rectangle whose width is returned * * RETURNS: * The width of the rectangle. */ static inline int drm_rect_width(const struct drm_rect *r) { return r->x2 - r->x1; } /** * drm_rect_height - determine the rectangle height * @r: rectangle whose height is returned * * RETURNS: * The height of the rectangle. 
*/ static inline int drm_rect_height(const struct drm_rect *r) { return r->y2 - r->y1; } /** * drm_rect_visible - determine if the rectangle is visible * @r: rectangle whose visibility is returned * * RETURNS: * %true if the rectangle is visible, %false otherwise. */ static inline bool drm_rect_visible(const struct drm_rect *r) { return drm_rect_width(r) > 0 && drm_rect_height(r) > 0; } /** * drm_rect_equals - determine if two rectangles are equal * @r1: first rectangle * @r2: second rectangle * * RETURNS: * %true if the rectangles are equal, %false otherwise. */ static inline bool drm_rect_equals(const struct drm_rect *r1, const struct drm_rect *r2) { return r1->x1 == r2->x1 && r1->x2 == r2->x2 && r1->y1 == r2->y1 && r1->y2 == r2->y2; } /** * drm_rect_fp_to_int - Convert a rect in 16.16 fixed point form to int form. * @dst: rect to be stored the converted value * @src: rect in 16.16 fixed point form */ static inline void drm_rect_fp_to_int(struct drm_rect *dst, const struct drm_rect *src) { drm_rect_init(dst, src->x1 >> 16, src->y1 >> 16, drm_rect_width(src) >> 16, drm_rect_height(src) >> 16); } /** * drm_rect_overlap - Check if two rectangles overlap * @a: first rectangle * @b: second rectangle * * RETURNS: * %true if the rectangles overlap, %false otherwise. */ static inline bool drm_rect_overlap(const struct drm_rect *a, const struct drm_rect *b) { return (a->x2 > b->x1 && b->x2 > a->x1 && a->y2 > b->y1 && b->y2 > a->y1); } bool drm_rect_intersect(struct drm_rect *r, const struct drm_rect *clip); bool drm_rect_clip_scaled(struct drm_rect *src, struct drm_rect *dst, const struct drm_rect *clip); int drm_rect_calc_hscale(const struct drm_rect *src, const struct drm_rect *dst, int min_hscale, int max_hscale); int drm_rect_calc_vscale(const struct drm_rect *src, const struct drm_rect *dst, int min_vscale, int max_vscale); void drm_rect_debug_print(const char *prefix, const struct drm_rect *r, bool fixed_point); void drm_rect_rotate(struct drm_rect *r, int width, int height, unsigned int rotation); void drm_rect_rotate_inv(struct drm_rect *r, int width, int height, unsigned int rotation); #endif |
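A small illustration of the helpers and printf macros above, clipping an arbitrary destination rectangle against an arbitrary clip rectangle. It assumes a struct drm_printer from include/drm/drm_print.h for output; the coordinates carry no particular meaning.

/* Illustrative use of the rect helpers; the coordinates are arbitrary. */
static void foo_clip_demo(struct drm_printer *p)
{
        struct drm_rect dst = DRM_RECT_INIT(100, 50, 640, 480);
        const struct drm_rect clip = DRM_RECT_INIT(0, 0, 512, 512);

        if (!drm_rect_overlap(&dst, &clip)) {
                drm_printf(p, "plane fully clipped\n");
                return;
        }

        /* Shrink dst to the visible part; returns false if nothing remains. */
        if (drm_rect_intersect(&dst, &clip))
                drm_printf(p, "visible: " DRM_RECT_FMT "\n", DRM_RECT_ARG(&dst));
}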
| 8 8 8 18 18 18 8 8 8 8 6 8 8 2 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 10 10 10 10 20 20 20 21 20 31 61 61 61 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2016 Qualcomm Atheros, Inc * * Based on net/sched/sch_fq_codel.c */ #ifndef __NET_SCHED_FQ_IMPL_H #define __NET_SCHED_FQ_IMPL_H #include <net/fq.h> /* functions that are embedded into includer */ static void __fq_adjust_removal(struct fq *fq, struct fq_flow *flow, unsigned int packets, unsigned int bytes, unsigned int truesize) { struct fq_tin *tin = flow->tin; int idx; tin->backlog_bytes -= bytes; tin->backlog_packets -= packets; flow->backlog -= bytes; fq->backlog -= packets; fq->memory_usage -= truesize; if (flow->backlog) return; if (flow == &tin->default_flow) { list_del_init(&tin->tin_list); return; } idx = flow - fq->flows; __clear_bit(idx, fq->flows_bitmap); } static void fq_adjust_removal(struct fq *fq, struct fq_flow *flow, struct sk_buff *skb) { __fq_adjust_removal(fq, flow, 1, skb->len, skb->truesize); } static struct sk_buff *fq_flow_dequeue(struct fq *fq, struct fq_flow *flow) { struct sk_buff *skb; lockdep_assert_held(&fq->lock); skb = __skb_dequeue(&flow->queue); if (!skb) return NULL; fq_adjust_removal(fq, flow, skb); return skb; } static int fq_flow_drop(struct fq *fq, struct fq_flow *flow, fq_skb_free_t free_func) { unsigned int packets = 0, bytes = 0, truesize = 0; struct fq_tin *tin = flow->tin; struct sk_buff *skb; int pending; lockdep_assert_held(&fq->lock); pending = min_t(int, 32, skb_queue_len(&flow->queue) / 2); do { skb = __skb_dequeue(&flow->queue); if (!skb) break; packets++; bytes += skb->len; truesize += skb->truesize; free_func(fq, tin, flow, skb); } while (packets < pending); __fq_adjust_removal(fq, flow, packets, bytes, truesize); return packets; } static struct sk_buff *fq_tin_dequeue(struct fq *fq, struct fq_tin *tin, fq_tin_dequeue_t dequeue_func) { struct fq_flow *flow; struct list_head *head; struct sk_buff *skb; lockdep_assert_held(&fq->lock); begin: head = &tin->new_flows; if (list_empty(head)) { head = &tin->old_flows; if 
(list_empty(head)) return NULL; } flow = list_first_entry(head, struct fq_flow, flowchain); if (flow->deficit <= 0) { flow->deficit += fq->quantum; list_move_tail(&flow->flowchain, &tin->old_flows); goto begin; } skb = dequeue_func(fq, tin, flow); if (!skb) { /* force a pass through old_flows to prevent starvation */ if ((head == &tin->new_flows) && !list_empty(&tin->old_flows)) { list_move_tail(&flow->flowchain, &tin->old_flows); } else { list_del_init(&flow->flowchain); flow->tin = NULL; } goto begin; } flow->deficit -= skb->len; tin->tx_bytes += skb->len; tin->tx_packets++; return skb; } static u32 fq_flow_idx(struct fq *fq, struct sk_buff *skb) { u32 hash = skb_get_hash(skb); return reciprocal_scale(hash, fq->flows_cnt); } static struct fq_flow *fq_flow_classify(struct fq *fq, struct fq_tin *tin, u32 idx, struct sk_buff *skb) { struct fq_flow *flow; lockdep_assert_held(&fq->lock); flow = &fq->flows[idx]; if (flow->tin && flow->tin != tin) { flow = &tin->default_flow; tin->collisions++; fq->collisions++; } if (!flow->tin) tin->flows++; return flow; } static struct fq_flow *fq_find_fattest_flow(struct fq *fq) { struct fq_tin *tin; struct fq_flow *flow = NULL; u32 len = 0; int i; for_each_set_bit(i, fq->flows_bitmap, fq->flows_cnt) { struct fq_flow *cur = &fq->flows[i]; unsigned int cur_len; cur_len = cur->backlog; if (cur_len <= len) continue; flow = cur; len = cur_len; } list_for_each_entry(tin, &fq->tin_backlog, tin_list) { unsigned int cur_len = tin->default_flow.backlog; if (cur_len <= len) continue; flow = &tin->default_flow; len = cur_len; } return flow; } static void fq_tin_enqueue(struct fq *fq, struct fq_tin *tin, u32 idx, struct sk_buff *skb, fq_skb_free_t free_func) { struct fq_flow *flow; struct sk_buff *next; bool oom; lockdep_assert_held(&fq->lock); flow = fq_flow_classify(fq, tin, idx, skb); if (!flow->backlog) { if (flow != &tin->default_flow) __set_bit(idx, fq->flows_bitmap); else if (list_empty(&tin->tin_list)) list_add(&tin->tin_list, &fq->tin_backlog); } flow->tin = tin; skb_list_walk_safe(skb, skb, next) { skb_mark_not_on_list(skb); flow->backlog += skb->len; tin->backlog_bytes += skb->len; tin->backlog_packets++; fq->memory_usage += skb->truesize; fq->backlog++; __skb_queue_tail(&flow->queue, skb); } if (list_empty(&flow->flowchain)) { flow->deficit = fq->quantum; list_add_tail(&flow->flowchain, &tin->new_flows); } oom = (fq->memory_usage > fq->memory_limit); while (fq->backlog > fq->limit || oom) { flow = fq_find_fattest_flow(fq); if (!flow) return; if (!fq_flow_drop(fq, flow, free_func)) return; flow->tin->overlimit++; fq->overlimit++; if (oom) { fq->overmemory++; oom = (fq->memory_usage > fq->memory_limit); } } } static void fq_flow_filter(struct fq *fq, struct fq_flow *flow, fq_skb_filter_t filter_func, void *filter_data, fq_skb_free_t free_func) { struct fq_tin *tin = flow->tin; struct sk_buff *skb, *tmp; lockdep_assert_held(&fq->lock); skb_queue_walk_safe(&flow->queue, skb, tmp) { if (!filter_func(fq, tin, flow, skb, filter_data)) continue; __skb_unlink(skb, &flow->queue); fq_adjust_removal(fq, flow, skb); free_func(fq, tin, flow, skb); } } static void fq_tin_filter(struct fq *fq, struct fq_tin *tin, fq_skb_filter_t filter_func, void *filter_data, fq_skb_free_t free_func) { struct fq_flow *flow; lockdep_assert_held(&fq->lock); list_for_each_entry(flow, &tin->new_flows, flowchain) fq_flow_filter(fq, flow, filter_func, filter_data, free_func); list_for_each_entry(flow, &tin->old_flows, flowchain) fq_flow_filter(fq, flow, filter_func, filter_data, free_func); } 
static void fq_flow_reset(struct fq *fq, struct fq_flow *flow, fq_skb_free_t free_func) { struct fq_tin *tin = flow->tin; struct sk_buff *skb; while ((skb = fq_flow_dequeue(fq, flow))) free_func(fq, tin, flow, skb); if (!list_empty(&flow->flowchain)) { list_del_init(&flow->flowchain); if (list_empty(&tin->new_flows) && list_empty(&tin->old_flows)) list_del_init(&tin->tin_list); } flow->tin = NULL; WARN_ON_ONCE(flow->backlog); } static void fq_tin_reset(struct fq *fq, struct fq_tin *tin, fq_skb_free_t free_func) { struct list_head *head; struct fq_flow *flow; for (;;) { head = &tin->new_flows; if (list_empty(head)) { head = &tin->old_flows; if (list_empty(head)) break; } flow = list_first_entry(head, struct fq_flow, flowchain); fq_flow_reset(fq, flow, free_func); } WARN_ON_ONCE(!list_empty(&tin->tin_list)); WARN_ON_ONCE(tin->backlog_bytes); WARN_ON_ONCE(tin->backlog_packets); } static void fq_flow_init(struct fq_flow *flow) { INIT_LIST_HEAD(&flow->flowchain); __skb_queue_head_init(&flow->queue); } static void fq_tin_init(struct fq_tin *tin) { INIT_LIST_HEAD(&tin->new_flows); INIT_LIST_HEAD(&tin->old_flows); INIT_LIST_HEAD(&tin->tin_list); fq_flow_init(&tin->default_flow); } static int fq_init(struct fq *fq, int flows_cnt) { int i; memset(fq, 0, sizeof(fq[0])); spin_lock_init(&fq->lock); INIT_LIST_HEAD(&fq->tin_backlog); fq->flows_cnt = max_t(u32, flows_cnt, 1); fq->quantum = 300; fq->limit = 8192; fq->memory_limit = 16 << 20; /* 16 MBytes */ fq->flows = kvcalloc(fq->flows_cnt, sizeof(fq->flows[0]), GFP_KERNEL); if (!fq->flows) return -ENOMEM; fq->flows_bitmap = bitmap_zalloc(fq->flows_cnt, GFP_KERNEL); if (!fq->flows_bitmap) { kvfree(fq->flows); fq->flows = NULL; return -ENOMEM; } for (i = 0; i < fq->flows_cnt; i++) fq_flow_init(&fq->flows[i]); return 0; } static void fq_reset(struct fq *fq, fq_skb_free_t free_func) { int i; for (i = 0; i < fq->flows_cnt; i++) fq_flow_reset(fq, &fq->flows[i], free_func); kvfree(fq->flows); fq->flows = NULL; bitmap_free(fq->flows_bitmap); fq->flows_bitmap = NULL; } #endif |
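The functions in this header are meant to be embedded by an includer (mac80211 is the in-tree user) that supplies the callbacks typedef'd in net/fq.h. Below is a hedged sketch of that glue with made-up foo_* names and the simplest possible dequeue callback (a real user would typically layer CoDel or other policy here); it assumes the includer has already called fq_init() and fq_tin_init() during setup.

/* Illustrative includer-side glue; the foo_* names are hypothetical. */
static void foo_skb_free(struct fq *fq, struct fq_tin *tin,
                         struct fq_flow *flow, struct sk_buff *skb)
{
        dev_kfree_skb_any(skb);
}

static struct sk_buff *foo_dequeue(struct fq *fq, struct fq_tin *tin,
                                   struct fq_flow *flow)
{
        /* Minimal policy: just pull the head of the flow's queue. */
        return fq_flow_dequeue(fq, flow);
}

static void foo_tx(struct fq *fq, struct fq_tin *tin, struct sk_buff *skb)
{
        u32 idx = fq_flow_idx(fq, skb);         /* hash skb into a flow slot */

        spin_lock_bh(&fq->lock);
        fq_tin_enqueue(fq, tin, idx, skb, foo_skb_free);
        spin_unlock_bh(&fq->lock);
}

static struct sk_buff *foo_next_packet(struct fq *fq, struct fq_tin *tin)
{
        struct sk_buff *skb;

        spin_lock_bh(&fq->lock);
        skb = fq_tin_dequeue(fq, tin, foo_dequeue);
        spin_unlock_bh(&fq->lock);
        return skb;
}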
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
* Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" /** * struct devlink_resource - devlink resource * @name: name of the resource * @id: id, per devlink instance * @size: size of the resource * @size_new: updated size of the resource, reload is needed * @size_valid: valid in case the total size of the resource is valid * including its children * @parent: parent resource * @size_params: size parameters * @list: parent list * @resource_list: list of child resources * @occ_get: occupancy getter callback * @occ_get_priv: occupancy getter callback priv */ struct devlink_resource { const char *name; u64 id; u64 size; u64 size_new; bool size_valid; struct devlink_resource *parent; struct devlink_resource_size_params size_params; struct list_head list; struct list_head resource_list; devlink_resource_occ_get_t *occ_get; void *occ_get_priv; }; static struct devlink_resource * devlink_resource_find(struct devlink *devlink, struct devlink_resource *resource, u64 resource_id) { struct list_head *resource_list; if (resource) resource_list = &resource->resource_list; else resource_list = &devlink->resource_list; list_for_each_entry(resource, resource_list, list) { struct devlink_resource *child_resource; if (resource->id == resource_id) return resource; child_resource = devlink_resource_find(devlink, resource, resource_id); if (child_resource) return child_resource; } return NULL; } static void devlink_resource_validate_children(struct devlink_resource *resource) { struct devlink_resource *child_resource; bool size_valid = true; u64 parts_size = 0; if (list_empty(&resource->resource_list)) goto out; list_for_each_entry(child_resource, &resource->resource_list, list) parts_size += child_resource->size_new; if (parts_size > resource->size_new) size_valid = false; out: resource->size_valid = size_valid; } static int devlink_resource_validate_size(struct devlink_resource *resource, u64 size, struct netlink_ext_ack *extack) { u64 reminder; int err = 0; if (size > resource->size_params.size_max) { NL_SET_ERR_MSG(extack, "Size larger than maximum"); err = -EINVAL; } if (size < resource->size_params.size_min) { NL_SET_ERR_MSG(extack, "Size smaller than minimum"); err = -EINVAL; } div64_u64_rem(size, resource->size_params.size_granularity, &reminder); if (reminder) { NL_SET_ERR_MSG(extack, "Wrong granularity"); err = -EINVAL; } return err; } int devlink_nl_resource_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_resource *resource; u64 resource_id; u64 size; int err; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_ID) || GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_SIZE)) return -EINVAL; resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]); resource = devlink_resource_find(devlink, NULL, resource_id); if (!resource) return -EINVAL; size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]); err = devlink_resource_validate_size(resource, size, info->extack); if (err) return err; resource->size_new = size; devlink_resource_validate_children(resource); if (resource->parent) devlink_resource_validate_children(resource->parent); return 0; } static int devlink_resource_size_params_put(struct devlink_resource *resource, struct sk_buff *skb) { struct devlink_resource_size_params *size_params; size_params = &resource->size_params; if (devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN, size_params->size_granularity) || devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX, size_params->size_max) || 
devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN, size_params->size_min) || nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit)) return -EMSGSIZE; return 0; } static int devlink_resource_occ_put(struct devlink_resource *resource, struct sk_buff *skb) { if (!resource->occ_get) return 0; return devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_OCC, resource->occ_get(resource->occ_get_priv)); } static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, struct devlink_resource *resource) { struct devlink_resource *child_resource; struct nlattr *child_resource_attr; struct nlattr *resource_attr; resource_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_RESOURCE); if (!resource_attr) return -EMSGSIZE; if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) || devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size) || devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id)) goto nla_put_failure; if (resource->size != resource->size_new && devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, resource->size_new)) goto nla_put_failure; if (devlink_resource_occ_put(resource, skb)) goto nla_put_failure; if (devlink_resource_size_params_put(resource, skb)) goto nla_put_failure; if (list_empty(&resource->resource_list)) goto out; if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID, resource->size_valid)) goto nla_put_failure; child_resource_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_RESOURCE_LIST); if (!child_resource_attr) goto nla_put_failure; list_for_each_entry(child_resource, &resource->resource_list, list) { if (devlink_resource_put(devlink, skb, child_resource)) goto resource_put_failure; } nla_nest_end(skb, child_resource_attr); out: nla_nest_end(skb, resource_attr); return 0; resource_put_failure: nla_nest_cancel(skb, child_resource_attr); nla_put_failure: nla_nest_cancel(skb, resource_attr); return -EMSGSIZE; } static int devlink_resource_fill(struct genl_info *info, enum devlink_command cmd, int flags) { struct devlink *devlink = info->user_ptr[0]; struct devlink_resource *resource; struct nlattr *resources_attr; struct sk_buff *skb = NULL; struct nlmsghdr *nlh; bool incomplete; void *hdr; int i; int err; resource = list_first_entry(&devlink->resource_list, struct devlink_resource, list); start_again: err = devlink_nl_msg_reply_and_new(&skb, info); if (err) return err; hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, &devlink_nl_family, NLM_F_MULTI, cmd); if (!hdr) { nlmsg_free(skb); return -EMSGSIZE; } if (devlink_nl_put_handle(skb, devlink)) goto nla_put_failure; resources_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_RESOURCE_LIST); if (!resources_attr) goto nla_put_failure; incomplete = false; i = 0; list_for_each_entry_from(resource, &devlink->resource_list, list) { err = devlink_resource_put(devlink, skb, resource); if (err) { if (!i) goto err_resource_put; incomplete = true; break; } i++; } nla_nest_end(skb, resources_attr); genlmsg_end(skb, hdr); if (incomplete) goto start_again; send_done: nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); if (!nlh) { err = devlink_nl_msg_reply_and_new(&skb, info); if (err) return err; goto send_done; } return genlmsg_reply(skb, info); nla_put_failure: err = -EMSGSIZE; err_resource_put: nlmsg_free(skb); return err; } int devlink_nl_resource_dump_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; if (list_empty(&devlink->resource_list)) return -EOPNOTSUPP; return devlink_resource_fill(info, 
DEVLINK_CMD_RESOURCE_DUMP, 0); } int devlink_resources_validate(struct devlink *devlink, struct devlink_resource *resource, struct genl_info *info) { struct list_head *resource_list; int err = 0; if (resource) resource_list = &resource->resource_list; else resource_list = &devlink->resource_list; list_for_each_entry(resource, resource_list, list) { if (!resource->size_valid) return -EINVAL; err = devlink_resources_validate(devlink, resource, info); if (err) return err; } return err; } /** * devl_resource_register - devlink resource register * * @devlink: devlink * @resource_name: resource's name * @resource_size: resource's size * @resource_id: resource's id * @parent_resource_id: resource's parent id * @size_params: size parameters * * Generic resources should reuse the same names across drivers. * Please see the generic resources list at: * Documentation/networking/devlink/devlink-resource.rst */ int devl_resource_register(struct devlink *devlink, const char *resource_name, u64 resource_size, u64 resource_id, u64 parent_resource_id, const struct devlink_resource_size_params *size_params) { struct devlink_resource *resource; struct list_head *resource_list; bool top_hierarchy; lockdep_assert_held(&devlink->lock); top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP; resource = devlink_resource_find(devlink, NULL, resource_id); if (resource) return -EEXIST; resource = kzalloc(sizeof(*resource), GFP_KERNEL); if (!resource) return -ENOMEM; if (top_hierarchy) { resource_list = &devlink->resource_list; } else { struct devlink_resource *parent_resource; parent_resource = devlink_resource_find(devlink, NULL, parent_resource_id); if (parent_resource) { resource_list = &parent_resource->resource_list; resource->parent = parent_resource; } else { kfree(resource); return -EINVAL; } } resource->name = resource_name; resource->size = resource_size; resource->size_new = resource_size; resource->id = resource_id; resource->size_valid = true; memcpy(&resource->size_params, size_params, sizeof(resource->size_params)); INIT_LIST_HEAD(&resource->resource_list); list_add_tail(&resource->list, resource_list); return 0; } EXPORT_SYMBOL_GPL(devl_resource_register); static void devlink_resource_unregister(struct devlink *devlink, struct devlink_resource *resource) { struct devlink_resource *tmp, *child_resource; list_for_each_entry_safe(child_resource, tmp, &resource->resource_list, list) { devlink_resource_unregister(devlink, child_resource); list_del(&child_resource->list); kfree(child_resource); } } /** * devl_resources_unregister - free all resources * * @devlink: devlink */ void devl_resources_unregister(struct devlink *devlink) { struct devlink_resource *tmp, *child_resource; lockdep_assert_held(&devlink->lock); list_for_each_entry_safe(child_resource, tmp, &devlink->resource_list, list) { devlink_resource_unregister(devlink, child_resource); list_del(&child_resource->list); kfree(child_resource); } } EXPORT_SYMBOL_GPL(devl_resources_unregister); /** * devlink_resources_unregister - free all resources * * @devlink: devlink * * Context: Takes and release devlink->lock <mutex>. 
*/ void devlink_resources_unregister(struct devlink *devlink) { devl_lock(devlink); devl_resources_unregister(devlink); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_resources_unregister); /** * devl_resource_size_get - get and update size * * @devlink: devlink * @resource_id: the requested resource id * @p_resource_size: ptr to update */ int devl_resource_size_get(struct devlink *devlink, u64 resource_id, u64 *p_resource_size) { struct devlink_resource *resource; lockdep_assert_held(&devlink->lock); resource = devlink_resource_find(devlink, NULL, resource_id); if (!resource) return -EINVAL; *p_resource_size = resource->size_new; resource->size = resource->size_new; return 0; } EXPORT_SYMBOL_GPL(devl_resource_size_get); /** * devl_resource_occ_get_register - register occupancy getter * * @devlink: devlink * @resource_id: resource id * @occ_get: occupancy getter callback * @occ_get_priv: occupancy getter callback priv */ void devl_resource_occ_get_register(struct devlink *devlink, u64 resource_id, devlink_resource_occ_get_t *occ_get, void *occ_get_priv) { struct devlink_resource *resource; lockdep_assert_held(&devlink->lock); resource = devlink_resource_find(devlink, NULL, resource_id); if (WARN_ON(!resource)) return; WARN_ON(resource->occ_get); resource->occ_get = occ_get; resource->occ_get_priv = occ_get_priv; } EXPORT_SYMBOL_GPL(devl_resource_occ_get_register); /** * devl_resource_occ_get_unregister - unregister occupancy getter * * @devlink: devlink * @resource_id: resource id */ void devl_resource_occ_get_unregister(struct devlink *devlink, u64 resource_id) { struct devlink_resource *resource; lockdep_assert_held(&devlink->lock); resource = devlink_resource_find(devlink, NULL, resource_id); if (WARN_ON(!resource)) return; WARN_ON(!resource->occ_get); resource->occ_get = NULL; resource->occ_get_priv = NULL; } EXPORT_SYMBOL_GPL(devl_resource_occ_get_unregister); |
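/*
 * Illustrative sketch, not part of resource.c: one way a driver could use the
 * devl_resource_*() API above to expose a two-level resource hierarchy with an
 * occupancy getter.  The "example" function names, the resource IDs and the
 * sizes are hypothetical; devlink_resource_size_params_init(), devl_lock()/
 * devl_unlock() and the devl_resource_*() calls are the interfaces documented
 * above.
 */
#include <net/devlink.h>

#define EXAMPLE_RES_ID_TABLES		1	/* hypothetical driver-local IDs */
#define EXAMPLE_RES_ID_TABLES_HASH	2

static u64 example_hash_occ_get(void *priv)
{
	/* A real driver would query its hardware/firmware usage counters. */
	return 0;
}

static int example_register_resources(struct devlink *devlink)
{
	struct devlink_resource_size_params params;
	int err;

	/* min 0, max 1M entries, granularity 128, counted in entries. */
	devlink_resource_size_params_init(&params, 0, 1 << 20, 128,
					  DEVLINK_RESOURCE_UNIT_ENTRY);

	devl_lock(devlink);

	/* Top-level resource: parent is DEVLINK_RESOURCE_ID_PARENT_TOP. */
	err = devl_resource_register(devlink, "tables", 1 << 20,
				     EXAMPLE_RES_ID_TABLES,
				     DEVLINK_RESOURCE_ID_PARENT_TOP, &params);
	if (err)
		goto out_unlock;

	/* Child resource, attached by passing the parent's resource id. */
	err = devl_resource_register(devlink, "hash", 1 << 18,
				     EXAMPLE_RES_ID_TABLES_HASH,
				     EXAMPLE_RES_ID_TABLES, &params);
	if (err)
		goto out_unregister;

	/* Optional occupancy getter, reported as DEVLINK_ATTR_RESOURCE_OCC. */
	devl_resource_occ_get_register(devlink, EXAMPLE_RES_ID_TABLES_HASH,
				       example_hash_occ_get, NULL);

	devl_unlock(devlink);
	return 0;

out_unregister:
	devl_resources_unregister(devlink);
out_unlock:
	devl_unlock(devlink);
	return err;
}

/*
 * After a devlink reload the driver would call devl_resource_size_get() to
 * pick up any size the user requested via the resource-set command above.
 */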
| // SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/gfp.h> #include <linux/hugetlb.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/fixmap.h> #include <asm/mtrr.h> #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; EXPORT_SYMBOL(physical_mask); SYM_PIC_ALIAS(physical_mask); #endif pgtable_t pte_alloc_one(struct mm_struct *mm) { return __pte_alloc_one(mm, GFP_PGTABLE_USER); } void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { paravirt_release_pte(page_to_pfn(pte)); tlb_remove_ptdesc(tlb, page_ptdesc(pte)); } #if CONFIG_PGTABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); /* * NOTE! For PAE, any changes to the top page-directory-pointer-table * entries need a full cr3 reload to flush. */ #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd)); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* CONFIG_PGTABLE_LEVELS > 2 */ static inline void pgd_list_add(pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); list_add(&ptdesc->pt_list, &pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); list_del(&ptdesc->pt_list); } static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { virt_to_ptdesc(pgd)->pt_mm = mm; } struct mm_struct *pgd_page_get_mm(struct page *page) { return page_ptdesc(page)->pt_mm; } static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { /* PAE preallocates all its PMDs. No cloning needed. */ if (!IS_ENABLED(CONFIG_X86_PAE)) clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); /* List used to sync kernel mapping updates */ pgd_set_mm(pgd, mm); pgd_list_add(pgd); } static void pgd_dtor(pgd_t *pgd) { spin_lock(&pgd_lock); pgd_list_del(pgd); spin_unlock(&pgd_lock); } /* * List of all pgd's needed for non-PAE so it can invalidate entries * in both cached and uncached pgd's; not needed for PAE since the * kernel pmd is shared. If PAE were not to share the pmd a similar * tactic would be needed. This is essentially codepath-based locking * against pageattr.c; it is the unique case in which a valid change * of kernel pagetables can't be lazily synchronized by vmalloc faults. * vmalloc faults work because attached pagetables are never freed. * -- nyc */ #ifdef CONFIG_X86_PAE /* * In PAE mode, we need to do a cr3 reload (=tlb flush) when * updating the top-level pagetable entries to guarantee the * processor notices the update. Since this is expensive, and * all 4 top-level entries are used almost immediately in a * new process's life, we just pre-populate them here. */ #define PREALLOCATED_PMDS PTRS_PER_PGD /* * "USER_PMDS" are the PMDs for the user copy of the page tables when * PTI is enabled. They do not exist when PTI is disabled. Note that * this is distinct from the user _portion_ of the kernel page tables * which always exists. * * We allocate separate PMDs for the kernel part of the user page-table * when PTI is enabled. 
We need them to map the per-process LDT into the * user-space page-table. */ #define PREALLOCATED_USER_PMDS (boot_cpu_has(X86_FEATURE_PTI) ? \ KERNEL_PGD_PTRS : 0) #define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) { paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); /* Note: almost everything apart from _PAGE_PRESENT is reserved at the pmd (PDPT) level. */ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); /* * According to Intel App note "TLBs, Paging-Structure Caches, * and Their Invalidation", April 2007, document 317080-001, * section 8.1: in PAE mode we explicitly have to flush the * TLB via cr3 if the top-level pgd is changed... */ flush_tlb_mm(mm); } #else /* !CONFIG_X86_PAE */ /* No need to prepopulate any pagetable entries in non-PAE modes. */ #define PREALLOCATED_PMDS 0 #define PREALLOCATED_USER_PMDS 0 #define MAX_PREALLOCATED_USER_PMDS 0 #endif /* CONFIG_X86_PAE */ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; struct ptdesc *ptdesc; for (i = 0; i < count; i++) if (pmds[i]) { ptdesc = virt_to_ptdesc(pmds[i]); pagetable_dtor(ptdesc); pagetable_free(ptdesc); mm_dec_nr_pmds(mm); } } static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; bool failed = false; gfp_t gfp = GFP_PGTABLE_USER; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; gfp &= ~__GFP_HIGHMEM; for (i = 0; i < count; i++) { pmd_t *pmd = NULL; struct ptdesc *ptdesc = pagetable_alloc(gfp, 0); if (!ptdesc) failed = true; if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) { pagetable_free(ptdesc); ptdesc = NULL; failed = true; } if (ptdesc) { mm_inc_nr_pmds(mm); pmd = ptdesc_address(ptdesc); } pmds[i] = pmd; } if (failed) { free_pmds(mm, pmds, count); return -ENOMEM; } return 0; } /* * Mop up any pmd pages which may still be attached to the pgd. * Normally they will be freed by munmap/exit_mmap, but any pmd we * preallocate which never got a corresponding vma will need to be * freed manually. 
*/ static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) { pgd_t pgd = *pgdp; if (pgd_val(pgd) != 0) { pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); pgd_clear(pgdp); paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); mm_dec_nr_pmds(mm); } } static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) { int i; for (i = 0; i < PREALLOCATED_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i]); #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (!boot_cpu_has(X86_FEATURE_PTI)) return; pgdp = kernel_to_user_pgdp(pgdp); for (i = 0; i < PREALLOCATED_USER_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]); #endif } static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { p4d_t *p4d; pud_t *pud; int i; p4d = p4d_offset(pgd, 0); pud = pud_offset(p4d, 0); for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; if (i >= KERNEL_PGD_BOUNDARY) memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), sizeof(pmd_t) * PTRS_PER_PMD); pud_populate(mm, pud, pmd); } } #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir); pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); p4d_t *u_p4d; pud_t *u_pud; int i; u_p4d = p4d_offset(u_pgd, 0); u_pud = pud_offset(u_p4d, 0); s_pgd += KERNEL_PGD_BOUNDARY; u_pud += KERNEL_PGD_BOUNDARY; for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) { pmd_t *pmd = pmds[i]; memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd), sizeof(pmd_t) * PTRS_PER_PMD); pud_populate(mm, u_pud, pmd); } } #else static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { } #endif static inline pgd_t *_pgd_alloc(struct mm_struct *mm) { /* * PTI and Xen need a whole page for the PAE PGD * even though the hardware only needs 32 bytes. * * For simplicity, allocate a page for all users. */ return __pgd_alloc(mm, pgd_allocation_order()); } static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) { __pgd_free(mm, pgd); } pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; pmd_t *pmds[PREALLOCATED_PMDS]; pgd = _pgd_alloc(mm); if (pgd == NULL) goto out; mm->pgd = pgd; if (sizeof(pmds) != 0 && preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) goto out_free_pgd; if (sizeof(u_pmds) != 0 && preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) goto out_free_pmds; if (paravirt_pgd_alloc(mm) != 0) goto out_free_user_pmds; /* * Make sure that pre-populating the pmds is atomic with * respect to anything walking the pgd_list, so that they * never see a partially populated pgd. */ spin_lock(&pgd_lock); pgd_ctor(mm, pgd); if (sizeof(pmds) != 0) pgd_prepopulate_pmd(mm, pgd, pmds); if (sizeof(u_pmds) != 0) pgd_prepopulate_user_pmd(mm, pgd, u_pmds); spin_unlock(&pgd_lock); return pgd; out_free_user_pmds: if (sizeof(u_pmds) != 0) free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); out_free_pmds: if (sizeof(pmds) != 0) free_pmds(mm, pmds, PREALLOCATED_PMDS); out_free_pgd: _pgd_free(mm, pgd); out: return NULL; } void pgd_free(struct mm_struct *mm, pgd_t *pgd) { pgd_mop_up_pmds(mm, pgd); pgd_dtor(pgd); paravirt_pgd_free(mm, pgd); _pgd_free(mm, pgd); } /* * Used to set accessed or dirty bits in the page table entries * on other architectures. On x86, the accessed and dirty bits * are tracked by hardware. However, do_wp_page calls this function * to also make the pte writeable at the same time the dirty bit is * set. 
In that case we do actually need to write the PTE. */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { int changed = !pte_same(*ptep, entry); if (changed && dirty) set_pte(ptep, entry); return changed; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { int changed = !pmd_same(*pmdp, entry); VM_BUG_ON(address & ~HPAGE_PMD_MASK); if (changed && dirty) { set_pmd(pmdp, entry); /* * We had a write-protection fault here and changed the pmd * to to more permissive. No need to flush the TLB for that, * #PF is architecturally guaranteed to do that and in the * worst-case we'll generate a spurious fault. */ } return changed; } int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { int changed = !pud_same(*pudp, entry); VM_BUG_ON(address & ~HPAGE_PUD_MASK); if (changed && dirty) { set_pud(pudp, entry); /* * We had a write-protection fault here and changed the pud * to to more permissive. No need to flush the TLB for that, * #PF is architecturally guaranteed to do that and in the * worst-case we'll generate a spurious fault. */ } return changed; } #endif int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { int ret = 0; if (pte_young(*ptep)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *) &ptep->pte); return ret; } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { int ret = 0; if (pmd_young(*pmdp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmdp); return ret; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp) { int ret = 0; if (pud_young(*pudp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pudp); return ret; } #endif int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { /* * On x86 CPUs, clearing the accessed bit without a TLB flush * doesn't cause data corruption. [ It could cause incorrect * page aging and the (mistaken) reclaim of hot pages, but the * chance of that should be relatively low. ] * * So as a performance optimization don't flush the TLB when * clearing the accessed bit, it will eventually be flushed by * a context switch or a VM operation anyway. [ In the rare * event of it not getting flushed for a long time the delay * shouldn't really matter because there's no real memory * pressure for swapout to react to. ] */ return ptep_test_and_clear_young(vma, address, ptep); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { int young; VM_BUG_ON(address & ~HPAGE_PMD_MASK); young = pmdp_test_and_clear_young(vma, address, pmdp); if (young) flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return young; } pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { VM_WARN_ON_ONCE(!pmd_present(*pmdp)); /* * No flush is necessary. Once an invalid PTE is established, the PTE's * access and dirty bits cannot be updated. 
*/ return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); } #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, pud_t *pudp) { VM_WARN_ON_ONCE(!pud_present(*pudp)); pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp)); flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); return old; } #endif /** * reserve_top_address - Reserve a hole in the top of the kernel address space * @reserve: Size of hole to reserve * * Can be used to relocate the fixmap area and poke a hole in the top * of the kernel address space to make room for a hypervisor. */ void __init reserve_top_address(unsigned long reserve) { #ifdef CONFIG_X86_32 BUG_ON(fixmaps_set > 0); __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", -reserve, __FIXADDR_TOP + PAGE_SIZE); #endif } int fixmaps_set; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) { unsigned long address = __fix_to_virt(idx); #ifdef CONFIG_X86_64 /* * Ensure that the static initial page tables are covering the * fixmap completely. */ BUILD_BUG_ON(__end_of_permanent_fixed_addresses > (FIXMAP_PMD_NUM * PTRS_PER_PTE)); #endif if (idx >= __end_of_fixed_addresses) { BUG(); return; } set_pte_vaddr(address, pte); fixmaps_set++; } void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags) { /* Sanitize 'prot' against any unsupported bits: */ pgprot_val(flags) &= __default_kernel_pte_mask; __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #if CONFIG_PGTABLE_LEVELS > 4 /** * p4d_set_huge - Set up kernel P4D mapping * @p4d: Pointer to the P4D entry * @addr: Virtual address associated with the P4D entry * @prot: Protection bits to use * * No 512GB pages yet -- always return 0 */ int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } /** * p4d_clear_huge - Clear kernel P4D mapping when it is set * @p4d: Pointer to the P4D entry to clear * * No 512GB pages yet -- do nothing */ void p4d_clear_huge(p4d_t *p4d) { } #endif /** * pud_set_huge - Set up kernel PUD mapping * @pud: Pointer to the PUD entry * @addr: Virtual address associated with the PUD entry * @prot: Protection bits to use * * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this * function sets up a huge page only if the complete range has the same MTRR * caching mode. * * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger * page mapping attempt fails. * * Returns 1 on success and 0 on failure. */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { u8 uniform; mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); if (!uniform) return 0; /* Bail out if we are we on a populated non-leaf entry: */ if (pud_present(*pud) && !pud_leaf(*pud)) return 0; set_pte((pte_t *)pud, pfn_pte( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } /** * pmd_set_huge - Set up kernel PMD mapping * @pmd: Pointer to the PMD entry * @addr: Virtual address associated with the PMD entry * @prot: Protection bits to use * * See text over pud_set_huge() above. * * Returns 1 on success and 0 on failure. 
*/ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { u8 uniform; mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); if (!uniform) { pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", __func__, addr, addr + PMD_SIZE); return 0; } /* Bail out if we are we on a populated non-leaf entry: */ if (pmd_present(*pmd) && !pmd_leaf(*pmd)) return 0; set_pte((pte_t *)pmd, pfn_pte( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } /** * pud_clear_huge - Clear kernel PUD mapping when it is set * @pud: Pointer to the PUD entry to clear. * * Returns 1 on success and 0 on failure (no PUD map is found). */ int pud_clear_huge(pud_t *pud) { if (pud_leaf(*pud)) { pud_clear(pud); return 1; } return 0; } /** * pmd_clear_huge - Clear kernel PMD mapping when it is set * @pmd: Pointer to the PMD entry to clear. * * Returns 1 on success and 0 on failure (no PMD map is found). */ int pmd_clear_huge(pmd_t *pmd) { if (pmd_leaf(*pmd)) { pmd_clear(pmd); return 1; } return 0; } #ifdef CONFIG_X86_64 /** * pud_free_pmd_page - Clear PUD entry and free PMD page * @pud: Pointer to a PUD * @addr: Virtual address associated with PUD * * Context: The PUD range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. * * NOTE: Callers must allow a single page allocation. */ int pud_free_pmd_page(pud_t *pud, unsigned long addr) { pmd_t *pmd, *pmd_sv; pte_t *pte; int i; pmd = pud_pgtable(*pud); pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL); if (!pmd_sv) return 0; for (i = 0; i < PTRS_PER_PMD; i++) { pmd_sv[i] = pmd[i]; if (!pmd_none(pmd[i])) pmd_clear(&pmd[i]); } pud_clear(pud); /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); for (i = 0; i < PTRS_PER_PMD; i++) { if (!pmd_none(pmd_sv[i])) { pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]); pte_free_kernel(&init_mm, pte); } } free_page((unsigned long)pmd_sv); pmd_free(&init_mm, pmd); return 1; } /** * pmd_free_pte_page - Clear PMD entry and free PTE page. * @pmd: Pointer to the PMD * @addr: Virtual address associated with PMD * * Context: The PMD range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { pte_t *pte; pte = (pte_t *)pmd_page_vaddr(*pmd); pmd_clear(pmd); /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); pte_free_kernel(&init_mm, pte); return 1; } #else /* !CONFIG_X86_64 */ /* * Disable free page handling on x86-PAE. This assures that ioremap() * does not update sync'd PMD entries. See vmalloc_sync_one(). */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return pmd_none(*pmd); } #endif /* CONFIG_X86_64 */ #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHADOW_STACK) return pte_mkwrite_shstk(pte); pte = pte_mkwrite_novma(pte); return pte_clear_saveddirty(pte); } pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHADOW_STACK) return pmd_mkwrite_shstk(pmd); pmd = pmd_mkwrite_novma(pmd); return pmd_clear_saveddirty(pmd); } void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) { /* * Hardware before shadow stack can (rarely) set Dirty=1 * on a Write=0 PTE. So the below condition * only indicates a software bug when shadow stack is * supported by the HW. This checking is covered in * pte_shstk(). 
*/ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pte_shstk(pte)); } void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) { /* See note in arch_check_zapped_pte() */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pmd_shstk(pmd)); } void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) { /* See note in arch_check_zapped_pte() */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud)); } |
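/*
 * Illustrative sketch, not part of pgtable.c: the caller-side contract of
 * pud_set_huge()/pmd_set_huge() documented above -- try the largest page size
 * first and fall back when the helper returns 0 (for example because the MTRR
 * layout is not uniform across the range).  The function name, the simplified
 * arguments and the omitted 4KiB fallback are hypothetical, and the sketch
 * assumes CONFIG_HAVE_ARCH_HUGE_VMAP; the real callers are the generic
 * huge-vmap/ioremap paths.
 */
#include <linux/align.h>
#include <linux/errno.h>
#include <linux/pgtable.h>

static int example_map_one_chunk(pud_t *pud, pmd_t *pmd, phys_addr_t phys,
				 pgprot_t prot)
{
	/* 1GiB mapping: only installed if the whole PUD range is uniform. */
	if (IS_ALIGNED(phys, PUD_SIZE) && pud_set_huge(pud, phys, prot))
		return 0;

	/* 2MiB mapping: pmd_set_huge() warns once and fails on MTRR overlap. */
	if (IS_ALIGNED(phys, PMD_SIZE) && pmd_set_huge(pmd, phys, prot))
		return 0;

	/* Final fallback would populate ordinary 4KiB PTEs (not shown). */
	return -EOPNOTSUPP;
}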
// SPDX-License-Identifier: GPL-2.0-only #include <linux/bitmap.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/export.h> #include <linux/hex.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include "kstrtox.h" /** * bitmap_parse_user - convert an ASCII hex string in a user buffer into a bitmap * * @ubuf: pointer to user buffer containing string. * @ulen: buffer size in bytes. If string is smaller than this * then it must be terminated with a \0. * @maskp: pointer to bitmap array that will contain result. * @nmaskbits: size of bitmap, in bits. */ int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, unsigned long *maskp, int nmaskbits) { char *buf; int ret; buf = memdup_user_nul(ubuf, ulen); if (IS_ERR(buf)) return PTR_ERR(buf); ret = bitmap_parse(buf, UINT_MAX, maskp, nmaskbits); kfree(buf); return ret; } EXPORT_SYMBOL(bitmap_parse_user); /** * bitmap_print_to_pagebuf - convert bitmap to list or hex format ASCII string * @list: indicates whether the bitmap must be list * @buf: page aligned buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * * Output format is a comma-separated list of decimal numbers and * ranges if list is specified or hex digits grouped into comma-separated * sets of 8 digits/set. Returns the number of characters written to buf.
* * It is assumed that @buf is a pointer into a PAGE_SIZE, page-aligned * area and that sufficient storage remains at @buf to accommodate the * bitmap_print_to_pagebuf() output. Returns the number of characters * actually printed to @buf, excluding terminating '\0'. */ int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits) { ptrdiff_t len = PAGE_SIZE - offset_in_page(buf); return list ? scnprintf(buf, len, "%*pbl\n", nmaskbits, maskp) : scnprintf(buf, len, "%*pb\n", nmaskbits, maskp); } EXPORT_SYMBOL(bitmap_print_to_pagebuf); /** * bitmap_print_to_buf - convert bitmap to list or hex format ASCII string * @list: indicates whether the bitmap must be list * true: print in decimal list format * false: print in hexadecimal bitmask format * @buf: buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * @off: in the string from which we are copying, We copy to @buf * @count: the maximum number of bytes to print */ static int bitmap_print_to_buf(bool list, char *buf, const unsigned long *maskp, int nmaskbits, loff_t off, size_t count) { const char *fmt = list ? "%*pbl\n" : "%*pb\n"; ssize_t size; void *data; data = kasprintf(GFP_KERNEL, fmt, nmaskbits, maskp); if (!data) return -ENOMEM; size = memory_read_from_buffer(buf, count, &off, data, strlen(data) + 1); kfree(data); return size; } /** * bitmap_print_bitmask_to_buf - convert bitmap to hex bitmask format ASCII string * @buf: buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * @off: in the string from which we are copying, We copy to @buf * @count: the maximum number of bytes to print * * The bitmap_print_to_pagebuf() is used indirectly via its cpumap wrapper * cpumap_print_to_pagebuf() or directly by drivers to export hexadecimal * bitmask and decimal list to userspace by sysfs ABI. * Drivers might be using a normal attribute for this kind of ABIs. A * normal attribute typically has show entry as below:: * * static ssize_t example_attribute_show(struct device *dev, * struct device_attribute *attr, char *buf) * { * ... * return bitmap_print_to_pagebuf(true, buf, &mask, nr_trig_max); * } * * show entry of attribute has no offset and count parameters and this * means the file is limited to one page only. * bitmap_print_to_pagebuf() API works terribly well for this kind of * normal attribute with buf parameter and without offset, count:: * * bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, * int nmaskbits) * { * } * * The problem is once we have a large bitmap, we have a chance to get a * bitmask or list more than one page. Especially for list, it could be * as complex as 0,3,5,7,9,... We have no simple way to know it exact size. * It turns out bin_attribute is a way to break this limit. bin_attribute * has show entry as below:: * * static ssize_t * example_bin_attribute_show(struct file *filp, struct kobject *kobj, * struct bin_attribute *attr, char *buf, * loff_t offset, size_t count) * { * ... * } * * With the new offset and count parameters, this makes sysfs ABI be able * to support file size more than one page. For example, offset could be * >= 4096. * bitmap_print_bitmask_to_buf(), bitmap_print_list_to_buf() wit their * cpumap wrapper cpumap_print_bitmask_to_buf(), cpumap_print_list_to_buf() * make those drivers be able to support large bitmask and list after they * move to use bin_attribute. 
In result, we have to pass the corresponding * parameters such as off, count from bin_attribute show entry to this API. * * The role of cpumap_print_bitmask_to_buf() and cpumap_print_list_to_buf() * is similar with cpumap_print_to_pagebuf(), the difference is that * bitmap_print_to_pagebuf() mainly serves sysfs attribute with the assumption * the destination buffer is exactly one page and won't be more than one page. * cpumap_print_bitmask_to_buf() and cpumap_print_list_to_buf(), on the other * hand, mainly serves bin_attribute which doesn't work with exact one page, * and it can break the size limit of converted decimal list and hexadecimal * bitmask. * * WARNING! * * This function is not a replacement for sprintf() or bitmap_print_to_pagebuf(). * It is intended to workaround sysfs limitations discussed above and should be * used carefully in general case for the following reasons: * * - Time complexity is O(nbits^2/count), comparing to O(nbits) for snprintf(). * - Memory complexity is O(nbits), comparing to O(1) for snprintf(). * - @off and @count are NOT offset and number of bits to print. * - If printing part of bitmap as list, the resulting string is not a correct * list representation of bitmap. Particularly, some bits within or out of * related interval may be erroneously set or unset. The format of the string * may be broken, so bitmap_parselist-like parser may fail parsing it. * - If printing the whole bitmap as list by parts, user must ensure the order * of calls of the function such that the offset is incremented linearly. * - If printing the whole bitmap as list by parts, user must keep bitmap * unchanged between the very first and very last call. Otherwise concatenated * result may be incorrect, and format may be broken. * * Returns the number of characters actually printed to @buf */ int bitmap_print_bitmask_to_buf(char *buf, const unsigned long *maskp, int nmaskbits, loff_t off, size_t count) { return bitmap_print_to_buf(false, buf, maskp, nmaskbits, off, count); } EXPORT_SYMBOL(bitmap_print_bitmask_to_buf); /** * bitmap_print_list_to_buf - convert bitmap to decimal list format ASCII string * @buf: buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * @off: in the string from which we are copying, We copy to @buf * @count: the maximum number of bytes to print * * Everything is same with the above bitmap_print_bitmask_to_buf() except * the print format. */ int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp, int nmaskbits, loff_t off, size_t count) { return bitmap_print_to_buf(true, buf, maskp, nmaskbits, off, count); } EXPORT_SYMBOL(bitmap_print_list_to_buf); /* * Region 9-38:4/10 describes the following bitmap structure: * 0 9 12 18 38 N * .........****......****......****.................. 
* ^ ^ ^ ^ ^ * start off group_len end nbits */ struct region { unsigned int start; unsigned int off; unsigned int group_len; unsigned int end; unsigned int nbits; }; static void bitmap_set_region(const struct region *r, unsigned long *bitmap) { unsigned int start; for (start = r->start; start <= r->end; start += r->group_len) bitmap_set(bitmap, start, min(r->end - start + 1, r->off)); } static int bitmap_check_region(const struct region *r) { if (r->start > r->end || r->group_len == 0 || r->off > r->group_len) return -EINVAL; if (r->end >= r->nbits) return -ERANGE; return 0; } static const char *bitmap_getnum(const char *str, unsigned int *num, unsigned int lastbit) { unsigned long long n; unsigned int len; if (str[0] == 'N') { *num = lastbit; return str + 1; } len = _parse_integer(str, 10, &n); if (!len) return ERR_PTR(-EINVAL); if (len & KSTRTOX_OVERFLOW || n != (unsigned int)n) return ERR_PTR(-EOVERFLOW); *num = n; return str + len; } static inline bool end_of_str(char c) { return c == '\0' || c == '\n'; } static inline bool __end_of_region(char c) { return isspace(c) || c == ','; } static inline bool end_of_region(char c) { return __end_of_region(c) || end_of_str(c); } /* * The format allows commas and whitespaces at the beginning * of the region. */ static const char *bitmap_find_region(const char *str) { while (__end_of_region(*str)) str++; return end_of_str(*str) ? NULL : str; } static const char *bitmap_find_region_reverse(const char *start, const char *end) { while (start <= end && __end_of_region(*end)) end--; return end; } static const char *bitmap_parse_region(const char *str, struct region *r) { unsigned int lastbit = r->nbits - 1; if (!strncasecmp(str, "all", 3)) { r->start = 0; r->end = lastbit; str += 3; goto check_pattern; } str = bitmap_getnum(str, &r->start, lastbit); if (IS_ERR(str)) return str; if (end_of_region(*str)) goto no_end; if (*str != '-') return ERR_PTR(-EINVAL); str = bitmap_getnum(str + 1, &r->end, lastbit); if (IS_ERR(str)) return str; check_pattern: if (end_of_region(*str)) goto no_pattern; if (*str != ':') return ERR_PTR(-EINVAL); str = bitmap_getnum(str + 1, &r->off, lastbit); if (IS_ERR(str)) return str; if (*str != '/') return ERR_PTR(-EINVAL); return bitmap_getnum(str + 1, &r->group_len, lastbit); no_end: r->end = r->start; no_pattern: r->off = r->end + 1; r->group_len = r->end + 1; return end_of_str(*str) ? NULL : str; } /** * bitmap_parselist - convert list format ASCII string to bitmap * @buf: read user string from this buffer; must be terminated * with a \0 or \n. * @maskp: write resulting mask here * @nmaskbits: number of bits in mask to be written * * Input format is a comma-separated list of decimal numbers and * ranges. Consecutively set bits are shown as two hyphen-separated * decimal numbers, the smallest and largest bit numbers set in * the range. * Optionally each range can be postfixed to denote that only parts of it * should be set. The range will divided to groups of specific size. * From each group will be used only defined amount of bits. * Syntax: range:used_size/group_size * Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769 * The value 'N' can be used as a dynamically substituted token for the * maximum allowed value; i.e (nmaskbits - 1). Keep in mind that it is * dynamic, so if system changes cause the bitmap width to change, such * as more cores in a CPU list, then any ranges using N will also change. * * Returns: 0 on success, -errno on invalid input strings. 
Error values: * * - ``-EINVAL``: wrong region format * - ``-EINVAL``: invalid character in string * - ``-ERANGE``: bit number specified too large for mask * - ``-EOVERFLOW``: integer overflow in the input parameters */ int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits) { struct region r; long ret; r.nbits = nmaskbits; bitmap_zero(maskp, r.nbits); while (buf) { buf = bitmap_find_region(buf); if (buf == NULL) return 0; buf = bitmap_parse_region(buf, &r); if (IS_ERR(buf)) return PTR_ERR(buf); ret = bitmap_check_region(&r); if (ret) return ret; bitmap_set_region(&r, maskp); } return 0; } EXPORT_SYMBOL(bitmap_parselist); /** * bitmap_parselist_user() - convert user buffer's list format ASCII * string to bitmap * * @ubuf: pointer to user buffer containing string. * @ulen: buffer size in bytes. If string is smaller than this * then it must be terminated with a \0. * @maskp: pointer to bitmap array that will contain result. * @nmaskbits: size of bitmap, in bits. * * Wrapper for bitmap_parselist(), providing it with user buffer. */ int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, unsigned long *maskp, int nmaskbits) { char *buf; int ret; buf = memdup_user_nul(ubuf, ulen); if (IS_ERR(buf)) return PTR_ERR(buf); ret = bitmap_parselist(buf, maskp, nmaskbits); kfree(buf); return ret; } EXPORT_SYMBOL(bitmap_parselist_user); static const char *bitmap_get_x32_reverse(const char *start, const char *end, u32 *num) { u32 ret = 0; int c, i; for (i = 0; i < 32; i += 4) { c = hex_to_bin(*end--); if (c < 0) return ERR_PTR(-EINVAL); ret |= c << i; if (start > end || __end_of_region(*end)) goto out; } if (hex_to_bin(*end--) >= 0) return ERR_PTR(-EOVERFLOW); out: *num = ret; return end; } /** * bitmap_parse - convert an ASCII hex string into a bitmap. * @start: pointer to buffer containing string. * @buflen: buffer size in bytes. If string is smaller than this * then it must be terminated with a \0 or \n. In that case, * UINT_MAX may be provided instead of string length. * @maskp: pointer to bitmap array that will contain result. * @nmaskbits: size of bitmap, in bits. * * Commas group hex digits into chunks. Each chunk defines exactly 32 * bits of the resultant bitmask. No chunk may specify a value larger * than 32 bits (%-EOVERFLOW), and if a chunk specifies a smaller value * then leading 0-bits are prepended. %-EINVAL is returned for illegal * characters. Grouping such as "1,,5", ",44", "," or "" is allowed. * Leading, embedded and trailing whitespace accepted. */ int bitmap_parse(const char *start, unsigned int buflen, unsigned long *maskp, int nmaskbits) { const char *end = strnchrnul(start, buflen, '\n') - 1; int chunks = BITS_TO_U32(nmaskbits); u32 *bitmap = (u32 *)maskp; int unset_bit; int chunk; for (chunk = 0; ; chunk++) { end = bitmap_find_region_reverse(start, end); if (start > end) break; if (!chunks--) return -EOVERFLOW; #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) end = bitmap_get_x32_reverse(start, end, &bitmap[chunk ^ 1]); #else end = bitmap_get_x32_reverse(start, end, &bitmap[chunk]); #endif if (IS_ERR(end)) return PTR_ERR(end); } unset_bit = (BITS_TO_U32(nmaskbits) - chunks) * 32; if (unset_bit < nmaskbits) { bitmap_clear(maskp, unset_bit, nmaskbits - unset_bit); return 0; } if (find_next_bit(maskp, unset_bit, nmaskbits) != unset_bit) return -EOVERFLOW; return 0; } EXPORT_SYMBOL(bitmap_parse); |
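/*
 * Illustrative sketch, not part of bitmap.c: exercising the documented
 * "start-end:used_size/group_size" list syntax with bitmap_parselist().  The
 * function name and the pr_info() reporting are example-only; the parse call,
 * DECLARE_BITMAP() and the expected bit pattern come from the kernel-doc
 * above.
 */
#include <linux/bitmap.h>
#include <linux/printk.h>

static int example_parse_list(void)
{
	DECLARE_BITMAP(mask, 1024);
	int err;

	/* Per the example above: sets bits 0,1,256,257,512,513,768,769. */
	err = bitmap_parselist("0-1023:2/256", mask, 1024);
	if (err)
		return err;	/* -EINVAL, -ERANGE or -EOVERFLOW */

	pr_info("parsed %u bits, first set bit %lu\n",
		bitmap_weight(mask, 1024), find_first_bit(mask, 1024));
	return 0;
}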
/* SPDX-License-Identifier: GPL-2.0 */ /* * fs-verity: read-only file-based authenticity protection * * This header declares the interface between the fs/verity/ support layer and * filesystems that support fs-verity. * * Copyright 2019 Google LLC */ #ifndef _LINUX_FSVERITY_H #define _LINUX_FSVERITY_H #include <linux/fs.h> #include <linux/mm.h> #include <crypto/hash_info.h> #include <crypto/sha2.h> #include <uapi/linux/fsverity.h> /* * Largest digest size among all hash algorithms supported by fs-verity. * Currently assumed to be <= size of fsverity_descriptor::root_hash. */ #define FS_VERITY_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE /* Arbitrary limit to bound the kmalloc() size. Can be changed. */ #define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 struct fsverity_info; /* Verity operations for filesystems */ struct fsverity_operations { /** * The offset of the pointer to struct fsverity_info in the * filesystem-specific part of the inode, relative to the beginning of * the common part of the inode (the 'struct inode'). */ ptrdiff_t inode_info_offs; /** * Begin enabling verity on the given file. * * @filp: a readonly file descriptor for the file * * The filesystem must do any needed filesystem-specific preparations * for enabling verity, e.g. evicting inline data. It also must return * -EBUSY if verity is already being enabled on the given file. * * i_rwsem is held for write. * * Return: 0 on success, -errno on failure */ int (*begin_enable_verity)(struct file *filp); /** * End enabling verity on the given file. * * @filp: a readonly file descriptor for the file * @desc: the verity descriptor to write, or NULL on failure * @desc_size: size of verity descriptor, or 0 on failure * @merkle_tree_size: total bytes the Merkle tree took up * * If desc == NULL, then enabling verity failed and the filesystem only * must do any necessary cleanups. Else, it must also store the given * verity descriptor to a fs-specific location associated with the inode * and do any fs-specific actions needed to mark the inode as a verity * inode, e.g. setting a bit in the on-disk inode. The filesystem is * also responsible for setting the S_VERITY flag in the VFS inode.
* * i_rwsem is held for write, but it may have been dropped between * ->begin_enable_verity() and ->end_enable_verity(). * * Return: 0 on success, -errno on failure */ int (*end_enable_verity)(struct file *filp, const void *desc, size_t desc_size, u64 merkle_tree_size); /** * Get the verity descriptor of the given inode. * * @inode: an inode with the S_VERITY flag set * @buf: buffer in which to place the verity descriptor * @bufsize: size of @buf, or 0 to retrieve the size only * * If bufsize == 0, then the size of the verity descriptor is returned. * Otherwise the verity descriptor is written to 'buf' and its actual * size is returned; -ERANGE is returned if it's too large. This may be * called by multiple processes concurrently on the same inode. * * Return: the size on success, -errno on failure */ int (*get_verity_descriptor)(struct inode *inode, void *buf, size_t bufsize); /** * Read a Merkle tree page of the given inode. * * @inode: the inode * @index: 0-based index of the page within the Merkle tree * @num_ra_pages: The number of Merkle tree pages that should be * prefetched starting at @index if the page at @index * isn't already cached. Implementations may ignore this * argument; it's only a performance optimization. * * This can be called at any time on an open verity file. It may be * called by multiple processes concurrently, even with the same page. * * Note that this must retrieve a *page*, not necessarily a *block*. * * Return: the page on success, ERR_PTR() on failure */ struct page *(*read_merkle_tree_page)(struct inode *inode, pgoff_t index, unsigned long num_ra_pages); /** * Write a Merkle tree block to the given inode. * * @inode: the inode for which the Merkle tree is being built * @buf: the Merkle tree block to write * @pos: the position of the block in the Merkle tree (in bytes) * @size: the Merkle tree block size (in bytes) * * This is only called between ->begin_enable_verity() and * ->end_enable_verity(). * * Return: 0 on success, -errno on failure */ int (*write_merkle_tree_block)(struct inode *inode, const void *buf, u64 pos, unsigned int size); }; #ifdef CONFIG_FS_VERITY /* * Returns the address of the verity info pointer within the filesystem-specific * part of the inode. (To save memory on filesystems that don't support * fsverity, a field in 'struct inode' itself is no longer used.) */ static inline struct fsverity_info ** fsverity_info_addr(const struct inode *inode) { VFS_WARN_ON_ONCE(inode->i_sb->s_vop->inode_info_offs == 0); return (void *)inode + inode->i_sb->s_vop->inode_info_offs; } static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) { /* * Since this function can be called on inodes belonging to filesystems * that don't support fsverity at all, and fsverity_info_addr() doesn't * work on such filesystems, we have to start with an IS_VERITY() check. * Checking IS_VERITY() here is also useful to minimize the overhead of * fsverity_active() on non-verity files. */ if (!IS_VERITY(inode)) return NULL; /* * Pairs with the cmpxchg_release() in fsverity_set_info(). I.e., * another task may publish the inode's verity info concurrently, * executing a RELEASE barrier. Use smp_load_acquire() here to safely * ACQUIRE the memory the other task published. 
*/ return smp_load_acquire(fsverity_info_addr(inode)); } /* enable.c */ int fsverity_ioctl_enable(struct file *filp, const void __user *arg); /* measure.c */ int fsverity_ioctl_measure(struct file *filp, void __user *arg); int fsverity_get_digest(struct inode *inode, u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE], u8 *alg, enum hash_algo *halg); /* open.c */ int __fsverity_file_open(struct inode *inode, struct file *filp); int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr); void __fsverity_cleanup_inode(struct inode *inode); /** * fsverity_cleanup_inode() - free the inode's verity info, if present * @inode: an inode being evicted * * Filesystems must call this on inode eviction to free the inode's verity info. */ static inline void fsverity_cleanup_inode(struct inode *inode) { /* * Only IS_VERITY() inodes can have verity info, so start by checking * for IS_VERITY() (which is faster than retrieving the pointer to the * verity info). This minimizes overhead for non-verity inodes. */ if (IS_VERITY(inode)) __fsverity_cleanup_inode(inode); else VFS_WARN_ON_ONCE(*fsverity_info_addr(inode) != NULL); } /* read_metadata.c */ int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg); /* verify.c */ bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset); void fsverity_verify_bio(struct bio *bio); void fsverity_enqueue_verify_work(struct work_struct *work); #else /* !CONFIG_FS_VERITY */ static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) { return NULL; } /* enable.c */ static inline int fsverity_ioctl_enable(struct file *filp, const void __user *arg) { return -EOPNOTSUPP; } /* measure.c */ static inline int fsverity_ioctl_measure(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fsverity_get_digest(struct inode *inode, u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE], u8 *alg, enum hash_algo *halg) { /* * fsverity is not enabled in the kernel configuration, so always report * that the file doesn't have fsverity enabled (digest size 0). */ return 0; } /* open.c */ static inline int __fsverity_file_open(struct inode *inode, struct file *filp) { return -EOPNOTSUPP; } static inline int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) { return -EOPNOTSUPP; } static inline void fsverity_cleanup_inode(struct inode *inode) { } /* read_metadata.c */ static inline int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg) { return -EOPNOTSUPP; } /* verify.c */ static inline bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset) { WARN_ON_ONCE(1); return false; } static inline void fsverity_verify_bio(struct bio *bio) { WARN_ON_ONCE(1); } static inline void fsverity_enqueue_verify_work(struct work_struct *work) { WARN_ON_ONCE(1); } #endif /* !CONFIG_FS_VERITY */ static inline bool fsverity_verify_folio(struct folio *folio) { return fsverity_verify_blocks(folio, folio_size(folio), 0); } static inline bool fsverity_verify_page(struct page *page) { return fsverity_verify_blocks(page_folio(page), PAGE_SIZE, 0); } /** * fsverity_active() - do reads from the inode need to go through fs-verity? * @inode: inode to check * * This checks whether the inode's verity info has been set. * * Filesystems call this from ->readahead() to check whether the pages need to * be verified or not. Don't use IS_VERITY() for this purpose; it's subject to * a race condition where the file is being read concurrently with * FS_IOC_ENABLE_VERITY completing. 
(S_VERITY is set before the verity info.) * * Return: true if reads need to go through fs-verity, otherwise false */ static inline bool fsverity_active(const struct inode *inode) { return fsverity_get_info(inode) != NULL; } /** * fsverity_file_open() - prepare to open a verity file * @inode: the inode being opened * @filp: the struct file being set up * * When opening a verity file, deny the open if it is for writing. Otherwise, * set up the inode's verity info if not already done. * * When combined with fscrypt, this must be called after fscrypt_file_open(). * Otherwise, we won't have the key set up to decrypt the verity metadata. * * Return: 0 on success, -errno on failure */ static inline int fsverity_file_open(struct inode *inode, struct file *filp) { if (IS_VERITY(inode)) return __fsverity_file_open(inode, filp); return 0; } /** * fsverity_prepare_setattr() - prepare to change a verity inode's attributes * @dentry: dentry through which the inode is being changed * @attr: attributes to change * * Verity files are immutable, so deny truncates. This isn't covered by the * open-time check because sys_truncate() takes a path, not a file descriptor. * * Return: 0 on success, -errno on failure */ static inline int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (IS_VERITY(d_inode(dentry))) return __fsverity_prepare_setattr(dentry, attr); return 0; } #endif /* _LINUX_FSVERITY_H */ |
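/*
 * Illustrative sketch, not part of the header above: one way a filesystem
 * might consume this API, following the kernel-doc comments. The myfs_*
 * names are hypothetical; a real filesystem would also provide a
 * struct fsverity_operations and verify data with fsverity_verify_blocks()
 * or fsverity_verify_bio() on its read path.
 */
static int myfs_open(struct inode *inode, struct file *filp)
{
	int err;

	/* Denies opens for write and sets up the inode's verity info if needed. */
	err = fsverity_file_open(inode, filp);
	if (err)
		return err;

	return generic_file_open(inode, filp);
}

static void myfs_readahead(struct readahead_control *rac)
{
	/*
	 * Use fsverity_active() rather than IS_VERITY(): S_VERITY is set
	 * before the verity info is published, so IS_VERITY() alone could
	 * race with FS_IOC_ENABLE_VERITY completing.
	 */
	bool need_verity = fsverity_active(rac->mapping->host);

	/* ... submit the reads, verifying folios when need_verity is true ... */
	(void)need_verity;
}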
// SPDX-License-Identifier: GPL-2.0-only
/*
 * HT handling
 *
 * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
 * Copyright 2002-2005, Instant802 Networks, Inc.
 * Copyright 2005-2006, Devicescape Software, Inc.
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation * Copyright 2017 Intel Deutschland GmbH * Copyright(c) 2020-2025 Intel Corporation */ #include <linux/ieee80211.h> #include <linux/export.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "rate.h" static void __check_htcap_disable(struct ieee80211_ht_cap *ht_capa, struct ieee80211_ht_cap *ht_capa_mask, struct ieee80211_sta_ht_cap *ht_cap, u16 flag) { __le16 le_flag = cpu_to_le16(flag); if (ht_capa_mask->cap_info & le_flag) { if (!(ht_capa->cap_info & le_flag)) ht_cap->cap &= ~flag; } } static void __check_htcap_enable(struct ieee80211_ht_cap *ht_capa, struct ieee80211_ht_cap *ht_capa_mask, struct ieee80211_sta_ht_cap *ht_cap, u16 flag) { __le16 le_flag = cpu_to_le16(flag); if ((ht_capa_mask->cap_info & le_flag) && (ht_capa->cap_info & le_flag)) ht_cap->cap |= flag; } void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_ht_cap *ht_cap) { struct ieee80211_ht_cap *ht_capa, *ht_capa_mask; u8 *scaps, *smask; int i; if (!ht_cap->ht_supported) return; switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: ht_capa = &sdata->u.mgd.ht_capa; ht_capa_mask = &sdata->u.mgd.ht_capa_mask; break; case NL80211_IFTYPE_ADHOC: ht_capa = &sdata->u.ibss.ht_capa; ht_capa_mask = &sdata->u.ibss.ht_capa_mask; break; default: WARN_ON_ONCE(1); return; } scaps = (u8 *)(&ht_capa->mcs.rx_mask); smask = (u8 *)(&ht_capa_mask->mcs.rx_mask); /* NOTE: If you add more over-rides here, update register_hw * ht_capa_mod_mask logic in main.c as well. * And, if this method can ever change ht_cap.ht_supported, fix * the check in ieee80211_add_ht_ie. */ /* check for HT over-rides, MCS rates first. */ for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) { u8 m = smask[i]; ht_cap->mcs.rx_mask[i] &= ~m; /* turn off all masked bits */ /* Add back rates that are supported */ ht_cap->mcs.rx_mask[i] |= (m & scaps[i]); } /* Force removal of HT-40 capabilities? */ __check_htcap_disable(ht_capa, ht_capa_mask, ht_cap, IEEE80211_HT_CAP_SUP_WIDTH_20_40); __check_htcap_disable(ht_capa, ht_capa_mask, ht_cap, IEEE80211_HT_CAP_SGI_40); /* Allow user to disable SGI-20 (SGI-40 is handled above) */ __check_htcap_disable(ht_capa, ht_capa_mask, ht_cap, IEEE80211_HT_CAP_SGI_20); /* Allow user to disable the max-AMSDU bit. */ __check_htcap_disable(ht_capa, ht_capa_mask, ht_cap, IEEE80211_HT_CAP_MAX_AMSDU); /* Allow user to disable LDPC */ __check_htcap_disable(ht_capa, ht_capa_mask, ht_cap, IEEE80211_HT_CAP_LDPC_CODING); /* Allow user to enable 40 MHz intolerant bit. */ __check_htcap_enable(ht_capa, ht_capa_mask, ht_cap, IEEE80211_HT_CAP_40MHZ_INTOLERANT); /* Allow user to enable TX STBC bit */ __check_htcap_enable(ht_capa, ht_capa_mask, ht_cap, IEEE80211_HT_CAP_TX_STBC); /* Allow user to configure RX STBC bits */ if (ht_capa_mask->cap_info & cpu_to_le16(IEEE80211_HT_CAP_RX_STBC)) ht_cap->cap |= le16_to_cpu(ht_capa->cap_info) & IEEE80211_HT_CAP_RX_STBC; /* Allow user to decrease AMPDU factor */ if (ht_capa_mask->ampdu_params_info & IEEE80211_HT_AMPDU_PARM_FACTOR) { u8 n = ht_capa->ampdu_params_info & IEEE80211_HT_AMPDU_PARM_FACTOR; if (n < ht_cap->ampdu_factor) ht_cap->ampdu_factor = n; } /* Allow the user to increase AMPDU density. 
*/ if (ht_capa_mask->ampdu_params_info & IEEE80211_HT_AMPDU_PARM_DENSITY) { u8 n = (ht_capa->ampdu_params_info & IEEE80211_HT_AMPDU_PARM_DENSITY) >> IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT; if (n > ht_cap->ampdu_density) ht_cap->ampdu_density = n; } } bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_ht_cap *ht_cap_ie, struct link_sta_info *link_sta) { struct ieee80211_bss_conf *link_conf; struct sta_info *sta = link_sta->sta; struct ieee80211_sta_ht_cap ht_cap, own_cap; u8 ampdu_info, tx_mcs_set_cap; int i, max_tx_streams; bool changed; enum ieee80211_sta_rx_bandwidth bw; enum nl80211_chan_width width; memset(&ht_cap, 0, sizeof(ht_cap)); if (!ht_cap_ie || !sband->ht_cap.ht_supported) goto apply; ht_cap.ht_supported = true; own_cap = sband->ht_cap; /* * If user has specified capability over-rides, take care * of that if the station we're setting up is the AP or TDLS peer that * we advertised a restricted capability set to. Override * our own capabilities and then use those below. */ if (sdata->vif.type == NL80211_IFTYPE_STATION || sdata->vif.type == NL80211_IFTYPE_ADHOC) ieee80211_apply_htcap_overrides(sdata, &own_cap); /* * The bits listed in this expression should be * the same for the peer and us, if the station * advertises more then we can't use those thus * we mask them out. */ ht_cap.cap = le16_to_cpu(ht_cap_ie->cap_info) & (own_cap.cap | ~(IEEE80211_HT_CAP_LDPC_CODING | IEEE80211_HT_CAP_SUP_WIDTH_20_40 | IEEE80211_HT_CAP_GRN_FLD | IEEE80211_HT_CAP_SGI_20 | IEEE80211_HT_CAP_SGI_40 | IEEE80211_HT_CAP_DSSSCCK40)); /* * The STBC bits are asymmetric -- if we don't have * TX then mask out the peer's RX and vice versa. */ if (!(own_cap.cap & IEEE80211_HT_CAP_TX_STBC)) ht_cap.cap &= ~IEEE80211_HT_CAP_RX_STBC; if (!(own_cap.cap & IEEE80211_HT_CAP_RX_STBC)) ht_cap.cap &= ~IEEE80211_HT_CAP_TX_STBC; ampdu_info = ht_cap_ie->ampdu_params_info; ht_cap.ampdu_factor = ampdu_info & IEEE80211_HT_AMPDU_PARM_FACTOR; ht_cap.ampdu_density = (ampdu_info & IEEE80211_HT_AMPDU_PARM_DENSITY) >> 2; /* own MCS TX capabilities */ tx_mcs_set_cap = own_cap.mcs.tx_params; /* Copy peer MCS TX capabilities, the driver might need them. */ ht_cap.mcs.tx_params = ht_cap_ie->mcs.tx_params; /* can we TX with MCS rates? 
*/ if (!(tx_mcs_set_cap & IEEE80211_HT_MCS_TX_DEFINED)) goto apply; /* Counting from 0, therefore +1 */ if (tx_mcs_set_cap & IEEE80211_HT_MCS_TX_RX_DIFF) max_tx_streams = ((tx_mcs_set_cap & IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK) >> IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT) + 1; else max_tx_streams = IEEE80211_HT_MCS_TX_MAX_STREAMS; /* * 802.11n-2009 20.3.5 / 20.6 says: * - indices 0 to 7 and 32 are single spatial stream * - 8 to 31 are multiple spatial streams using equal modulation * [8..15 for two streams, 16..23 for three and 24..31 for four] * - remainder are multiple spatial streams using unequal modulation */ for (i = 0; i < max_tx_streams; i++) ht_cap.mcs.rx_mask[i] = own_cap.mcs.rx_mask[i] & ht_cap_ie->mcs.rx_mask[i]; if (tx_mcs_set_cap & IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION) for (i = IEEE80211_HT_MCS_UNEQUAL_MODULATION_START_BYTE; i < IEEE80211_HT_MCS_MASK_LEN; i++) ht_cap.mcs.rx_mask[i] = own_cap.mcs.rx_mask[i] & ht_cap_ie->mcs.rx_mask[i]; /* handle MCS rate 32 too */ if (own_cap.mcs.rx_mask[32/8] & ht_cap_ie->mcs.rx_mask[32/8] & 1) ht_cap.mcs.rx_mask[32/8] |= 1; /* set Rx highest rate */ ht_cap.mcs.rx_highest = ht_cap_ie->mcs.rx_highest; if (ht_cap.cap & IEEE80211_HT_CAP_MAX_AMSDU) link_sta->pub->agg.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_7935; else link_sta->pub->agg.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_3839; ieee80211_sta_recalc_aggregates(&sta->sta); apply: changed = memcmp(&link_sta->pub->ht_cap, &ht_cap, sizeof(ht_cap)); memcpy(&link_sta->pub->ht_cap, &ht_cap, sizeof(ht_cap)); rcu_read_lock(); link_conf = rcu_dereference(sdata->vif.link_conf[link_sta->link_id]); if (WARN_ON(!link_conf)) width = NL80211_CHAN_WIDTH_20_NOHT; else width = link_conf->chanreq.oper.width; switch (width) { default: WARN_ON_ONCE(1); fallthrough; case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: bw = IEEE80211_STA_RX_BW_20; break; case NL80211_CHAN_WIDTH_40: case NL80211_CHAN_WIDTH_80: case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_160: case NL80211_CHAN_WIDTH_320: bw = ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20; break; } rcu_read_unlock(); link_sta->pub->bandwidth = bw; link_sta->cur_max_bandwidth = ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? 
IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20; if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { enum ieee80211_smps_mode smps_mode; switch ((ht_cap.cap & IEEE80211_HT_CAP_SM_PS) >> IEEE80211_HT_CAP_SM_PS_SHIFT) { case WLAN_HT_CAP_SM_PS_INVALID: case WLAN_HT_CAP_SM_PS_STATIC: smps_mode = IEEE80211_SMPS_STATIC; break; case WLAN_HT_CAP_SM_PS_DYNAMIC: smps_mode = IEEE80211_SMPS_DYNAMIC; break; case WLAN_HT_CAP_SM_PS_DISABLED: smps_mode = IEEE80211_SMPS_OFF; break; } if (smps_mode != link_sta->pub->smps_mode) changed = true; link_sta->pub->smps_mode = smps_mode; } else { link_sta->pub->smps_mode = IEEE80211_SMPS_OFF; } return changed; } void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, enum ieee80211_agg_stop_reason reason) { int i; lockdep_assert_wiphy(sta->local->hw.wiphy); for (i = 0; i < IEEE80211_NUM_TIDS; i++) __ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT, WLAN_REASON_QSTA_LEAVE_QBSS, reason != AGG_STOP_DESTROY_STA && reason != AGG_STOP_PEER_REQUEST); for (i = 0; i < IEEE80211_NUM_TIDS; i++) __ieee80211_stop_tx_ba_session(sta, i, reason); /* * In case the tear down is part of a reconfigure due to HW restart * request, it is possible that the low level driver requested to stop * the BA session, so handle it to properly clean tid_tx data. */ if(reason == AGG_STOP_DESTROY_STA) { wiphy_work_cancel(sta->local->hw.wiphy, &sta->ampdu_mlme.work); for (i = 0; i < IEEE80211_NUM_TIDS; i++) { struct tid_ampdu_tx *tid_tx = rcu_dereference_protected_tid_tx(sta, i); if (!tid_tx) continue; if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state)) ieee80211_stop_tx_ba_cb(sta, i, tid_tx); } } } void ieee80211_ba_session_work(struct wiphy *wiphy, struct wiphy_work *work) { struct sta_info *sta = container_of(work, struct sta_info, ampdu_mlme.work); struct tid_ampdu_tx *tid_tx; bool blocked; int tid; lockdep_assert_wiphy(sta->local->hw.wiphy); /* When this flag is set, new sessions should be blocked. */ blocked = test_sta_flag(sta, WLAN_STA_BLOCK_BA); for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) { if (test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired)) __ieee80211_stop_rx_ba_session( sta, tid, WLAN_BACK_RECIPIENT, WLAN_REASON_QSTA_TIMEOUT, true); if (test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_stop_requested)) __ieee80211_stop_rx_ba_session( sta, tid, WLAN_BACK_RECIPIENT, WLAN_REASON_UNSPECIFIED, true); if (!blocked && test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_manage_offl)) __ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, IEEE80211_MAX_AMPDU_BUF_HT, false, true, 0); if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS, sta->ampdu_mlme.tid_rx_manage_offl)) __ieee80211_stop_rx_ba_session( sta, tid, WLAN_BACK_RECIPIENT, 0, false); spin_lock_bh(&sta->lock); tid_tx = sta->ampdu_mlme.tid_start_tx[tid]; if (!blocked && tid_tx) { struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); struct ieee80211_sub_if_data *sdata = vif_to_sdata(txqi->txq.vif); struct fq *fq = &sdata->local->fq; spin_lock_bh(&fq->lock); /* Allow only frags to be dequeued */ set_bit(IEEE80211_TXQ_STOP, &txqi->flags); if (!skb_queue_empty(&txqi->frags)) { /* Fragmented Tx is ongoing, wait for it to * finish. Reschedule worker to retry later. 
*/ spin_unlock_bh(&fq->lock); spin_unlock_bh(&sta->lock); /* Give the task working on the txq a chance * to send out the queued frags */ synchronize_net(); wiphy_work_queue(sdata->local->hw.wiphy, work); return; } spin_unlock_bh(&fq->lock); /* * Assign it over to the normal tid_tx array * where it "goes live". */ sta->ampdu_mlme.tid_start_tx[tid] = NULL; /* could there be a race? */ if (sta->ampdu_mlme.tid_tx[tid]) kfree(tid_tx); else ieee80211_assign_tid_tx(sta, tid, tid_tx); spin_unlock_bh(&sta->lock); ieee80211_tx_ba_session_handle_start(sta, tid); continue; } spin_unlock_bh(&sta->lock); tid_tx = rcu_dereference_protected_tid_tx(sta, tid); if (!tid_tx) continue; if (!blocked && test_and_clear_bit(HT_AGG_STATE_START_CB, &tid_tx->state)) ieee80211_start_tx_ba_cb(sta, tid, tid_tx); if (test_and_clear_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state)) __ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_LOCAL_REQUEST); if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state)) ieee80211_stop_tx_ba_cb(sta, tid, tid_tx); } } void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid, u16 initiator, u16 reason_code) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u16 params; skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); mgmt = ieee80211_mgmt_ba(skb, da, sdata); skb_put(skb, 1 + sizeof(mgmt->u.action.u.delba)); mgmt->u.action.category = WLAN_CATEGORY_BACK; mgmt->u.action.u.delba.action_code = WLAN_ACTION_DELBA; params = (u16)(initiator << 11); /* bit 11 initiator */ params |= (u16)(tid << 12); /* bit 15:12 TID number */ mgmt->u.action.u.delba.params = cpu_to_le16(params); mgmt->u.action.u.delba.reason_code = cpu_to_le16(reason_code); ieee80211_tx_skb(sdata, skb); } void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len) { u16 tid, params; u16 initiator; params = le16_to_cpu(mgmt->u.action.u.delba.params); tid = (params & IEEE80211_DELBA_PARAM_TID_MASK) >> 12; initiator = (params & IEEE80211_DELBA_PARAM_INITIATOR_MASK) >> 11; ht_dbg_ratelimited(sdata, "delba from %pM (%s) tid %d reason code %d\n", mgmt->sa, initiator ? "initiator" : "recipient", tid, le16_to_cpu(mgmt->u.action.u.delba.reason_code)); if (initiator == WLAN_BACK_INITIATOR) __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_INITIATOR, 0, true); else __ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_PEER_REQUEST); } enum nl80211_smps_mode ieee80211_smps_mode_to_smps_mode(enum ieee80211_smps_mode smps) { switch (smps) { case IEEE80211_SMPS_OFF: return NL80211_SMPS_OFF; case IEEE80211_SMPS_STATIC: return NL80211_SMPS_STATIC; case IEEE80211_SMPS_DYNAMIC: return NL80211_SMPS_DYNAMIC; default: return NL80211_SMPS_OFF; } } int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps, const u8 *da, const u8 *bssid, int link_id) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *action_frame; struct ieee80211_tx_info *info; u8 status_link_id = link_id < 0 ? 
0 : link_id; /* 27 = header + category + action + smps mode */ skb = dev_alloc_skb(27 + local->hw.extra_tx_headroom); if (!skb) return -ENOMEM; skb_reserve(skb, local->hw.extra_tx_headroom); action_frame = skb_put(skb, 27); memcpy(action_frame->da, da, ETH_ALEN); memcpy(action_frame->sa, sdata->dev->dev_addr, ETH_ALEN); memcpy(action_frame->bssid, bssid, ETH_ALEN); action_frame->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); action_frame->u.action.category = WLAN_CATEGORY_HT; action_frame->u.action.u.ht_smps.action = WLAN_HT_ACTION_SMPS; switch (smps) { case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); smps = IEEE80211_SMPS_OFF; fallthrough; case IEEE80211_SMPS_OFF: action_frame->u.action.u.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_DISABLED; break; case IEEE80211_SMPS_STATIC: action_frame->u.action.u.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_STATIC; break; case IEEE80211_SMPS_DYNAMIC: action_frame->u.action.u.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_DYNAMIC; break; } /* we'll do more on status of this frame */ info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; /* we have 13 bits, and need 6: link_id 4, smps 2 */ info->status_data = IEEE80211_STATUS_TYPE_SMPS | u16_encode_bits(status_link_id << 2 | smps, IEEE80211_STATUS_SUBDATA_MASK); ieee80211_tx_skb_tid(sdata, skb, 7, link_id); return 0; } void ieee80211_request_smps(struct ieee80211_vif *vif, unsigned int link_id, enum ieee80211_smps_mode smps_mode) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_link_data *link; if (WARN_ON_ONCE(vif->type != NL80211_IFTYPE_STATION)) return; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (WARN_ON(!link)) goto out; trace_api_request_smps(sdata->local, sdata, link, smps_mode); if (link->u.mgd.driver_smps_mode == smps_mode) goto out; link->u.mgd.driver_smps_mode = smps_mode; wiphy_work_queue(sdata->local->hw.wiphy, &link->u.mgd.request_smps_work); out: rcu_read_unlock(); } /* this might change ... don't want non-open drivers using it */ EXPORT_SYMBOL_GPL(ieee80211_request_smps); void ieee80211_ht_handle_chanwidth_notif(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct link_sta_info *link_sta, u8 chanwidth, enum nl80211_band band) { enum ieee80211_sta_rx_bandwidth max_bw, new_bw; struct ieee80211_supported_band *sband; struct sta_opmode_info sta_opmode = {}; lockdep_assert_wiphy(local->hw.wiphy); if (chanwidth == IEEE80211_HT_CHANWIDTH_20MHZ) max_bw = IEEE80211_STA_RX_BW_20; else max_bw = ieee80211_sta_cap_rx_bw(link_sta); /* set cur_max_bandwidth and recalc sta bw */ link_sta->cur_max_bandwidth = max_bw; new_bw = ieee80211_sta_cur_vht_bw(link_sta); if (link_sta->pub->bandwidth == new_bw) return; link_sta->pub->bandwidth = new_bw; sband = local->hw.wiphy->bands[band]; sta_opmode.bw = ieee80211_sta_rx_bw_to_chan_width(link_sta); sta_opmode.changed = STA_OPMODE_MAX_BW_CHANGED; rate_control_rate_update(local, sband, link_sta, IEEE80211_RC_BW_CHANGED); cfg80211_sta_opmode_change_notify(sdata->dev, sta->addr, &sta_opmode, GFP_KERNEL); } |
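/*
 * Illustrative, standalone sketch (userspace-style, not part of mac80211):
 * ieee80211_send_delba() above packs the initiator flag into bit 11 and the
 * TID into bits 15:12 of the DELBA parameters field, and
 * ieee80211_process_delba() recovers them with the corresponding masks.
 * The DELBA_PARAM_* macros below are local stand-ins mirroring that layout.
 */
#include <assert.h>
#include <stdint.h>

#define DELBA_PARAM_INITIATOR_MASK	0x0800	/* bit 11 */
#define DELBA_PARAM_TID_MASK		0xF000	/* bits 15:12 */

static uint16_t delba_pack_params(unsigned int initiator, unsigned int tid)
{
	return (uint16_t)((initiator << 11) | (tid << 12));
}

static void delba_params_roundtrip(void)
{
	uint16_t params = delba_pack_params(1, 5);

	/* Unpacking with the masks yields the original TID and initiator. */
	assert(((params & DELBA_PARAM_TID_MASK) >> 12) == 5);
	assert(((params & DELBA_PARAM_INITIATOR_MASK) >> 11) == 1);
}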
| 2811 3208 3210 10821 10825 3200 3216 3202 2646 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 | // SPDX-License-Identifier: GPL-2.0-or-later /* * printk_safe.c - Safe printk for printk-deadlock-prone contexts */ #include <linux/preempt.h> #include <linux/kdb.h> #include <linux/smp.h> #include <linux/cpumask.h> #include <linux/printk.h> #include <linux/kprobes.h> #include "internal.h" /* Context where printk messages are never suppressed */ static atomic_t force_con; void printk_force_console_enter(void) { atomic_inc(&force_con); } void printk_force_console_exit(void) { atomic_dec(&force_con); } bool is_printk_force_console(void) { return atomic_read(&force_con); } static DEFINE_PER_CPU(int, printk_context); /* Can be preempted by NMI. */ void __printk_safe_enter(void) { this_cpu_inc(printk_context); } /* Can be preempted by NMI. */ void __printk_safe_exit(void) { this_cpu_dec(printk_context); } void __printk_deferred_enter(void) { cant_migrate(); __printk_safe_enter(); } void __printk_deferred_exit(void) { cant_migrate(); __printk_safe_exit(); } bool is_printk_legacy_deferred(void) { /* * The per-CPU variable @printk_context can be read safely in any * context. CPU migration is always disabled when set. * * A context holding the printk_cpu_sync must not spin waiting for * another CPU. For legacy printing, it could be the console_lock * or the port lock. */ return (force_legacy_kthread() || this_cpu_read(printk_context) || in_nmi() || is_printk_cpu_sync_owner()); } asmlinkage int vprintk(const char *fmt, va_list args) { #ifdef CONFIG_KGDB_KDB /* Allow to pass printk() to kdb but avoid a recursion. */ if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); #endif return vprintk_default(fmt, args); } EXPORT_SYMBOL(vprintk); |
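/*
 * Illustrative, standalone sketch (not part of printk): the enter/exit
 * helpers above are just a nesting counter kept per CPU, so sections may
 * nest freely and is_printk_legacy_deferred() keeps reporting "deferred"
 * until the outermost section exits. A minimal model of that counting:
 */
#include <assert.h>

static int context;	/* stands in for the per-CPU printk_context */

static void safe_enter(void) { context++; }
static void safe_exit(void)  { context--; }

static void printk_context_nesting_demo(void)
{
	safe_enter();		/* outer section */
	safe_enter();		/* nested section */
	assert(context == 2);	/* still inside */
	safe_exit();
	assert(context == 1);	/* outer section still active */
	safe_exit();
	assert(context == 0);	/* fully outside again */
}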
// SPDX-License-Identifier: GPL-2.0-or-later
/* Asymmetric public-key cryptography key type
 *
 * See Documentation/crypto/asymmetric-keys.rst
 *
 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */
#include <keys/asymmetric-subtype.h>
#include <keys/asymmetric-parser.h>
#include <crypto/public_key.h>
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/ctype.h>
#include <keys/system_keyring.h>
#include <keys/user-type.h>
#include "asymmetric_keys.h"

static LIST_HEAD(asymmetric_key_parsers);
static DECLARE_RWSEM(asymmetric_key_parsers_sem);

/**
 * find_asymmetric_key - Find a key by ID.
 * @keyring: The keys to search.
 * @id_0: The first ID to look for or NULL.
* @id_1: The second ID to look for or NULL, matched together with @id_0 * against @keyring keys' id[0] and id[1]. * @id_2: The fallback ID to match against @keyring keys' id[2] if both of the * other IDs are NULL. * @partial: Use partial match for @id_0 and @id_1 if true, exact if false. * * Find a key in the given keyring by identifier. The preferred identifier is * the id_0 and the fallback identifier is the id_1. If both are given, the * former is matched (exactly or partially) against either of the sought key's * identifiers and the latter must match the found key's second identifier * exactly. If both are missing, id_2 must match the sought key's third * identifier exactly. */ struct key *find_asymmetric_key(struct key *keyring, const struct asymmetric_key_id *id_0, const struct asymmetric_key_id *id_1, const struct asymmetric_key_id *id_2, bool partial) { struct key *key; key_ref_t ref; const char *lookup; char *req, *p; int len; if (id_0) { lookup = id_0->data; len = id_0->len; } else if (id_1) { lookup = id_1->data; len = id_1->len; } else if (id_2) { lookup = id_2->data; len = id_2->len; } else { WARN_ON(1); return ERR_PTR(-EINVAL); } /* Construct an identifier "id:<keyid>". */ p = req = kmalloc(2 + 1 + len * 2 + 1, GFP_KERNEL); if (!req) return ERR_PTR(-ENOMEM); if (!id_0 && !id_1) { *p++ = 'd'; *p++ = 'n'; } else if (partial) { *p++ = 'i'; *p++ = 'd'; } else { *p++ = 'e'; *p++ = 'x'; } *p++ = ':'; p = bin2hex(p, lookup, len); *p = 0; pr_debug("Look up: \"%s\"\n", req); ref = keyring_search(make_key_ref(keyring, 1), &key_type_asymmetric, req, true); if (IS_ERR(ref)) pr_debug("Request for key '%s' err %ld\n", req, PTR_ERR(ref)); kfree(req); if (IS_ERR(ref)) { switch (PTR_ERR(ref)) { /* Hide some search errors */ case -EACCES: case -ENOTDIR: case -EAGAIN: return ERR_PTR(-ENOKEY); default: return ERR_CAST(ref); } } key = key_ref_to_ptr(ref); if (id_0 && id_1) { const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); if (!kids->id[1]) { pr_debug("First ID matches, but second is missing\n"); goto reject; } if (!asymmetric_key_id_same(id_1, kids->id[1])) { pr_debug("First ID matches, but second does not\n"); goto reject; } } pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key)); return key; reject: key_put(key); return ERR_PTR(-EKEYREJECTED); } EXPORT_SYMBOL_GPL(find_asymmetric_key); /** * asymmetric_key_generate_id: Construct an asymmetric key ID * @val_1: First binary blob * @len_1: Length of first binary blob * @val_2: Second binary blob * @len_2: Length of second binary blob * * Construct an asymmetric key ID from a pair of binary blobs. */ struct asymmetric_key_id *asymmetric_key_generate_id(const void *val_1, size_t len_1, const void *val_2, size_t len_2) { struct asymmetric_key_id *kid; kid = kmalloc(sizeof(struct asymmetric_key_id) + len_1 + len_2, GFP_KERNEL); if (!kid) return ERR_PTR(-ENOMEM); kid->len = len_1 + len_2; memcpy(kid->data, val_1, len_1); memcpy(kid->data + len_1, val_2, len_2); return kid; } EXPORT_SYMBOL_GPL(asymmetric_key_generate_id); /** * asymmetric_key_id_same - Return true if two asymmetric keys IDs are the same. 
* @kid1: The key ID to compare * @kid2: The key ID to compare */ bool asymmetric_key_id_same(const struct asymmetric_key_id *kid1, const struct asymmetric_key_id *kid2) { if (!kid1 || !kid2) return false; if (kid1->len != kid2->len) return false; return memcmp(kid1->data, kid2->data, kid1->len) == 0; } EXPORT_SYMBOL_GPL(asymmetric_key_id_same); /** * asymmetric_key_id_partial - Return true if two asymmetric keys IDs * partially match * @kid1: The key ID to compare * @kid2: The key ID to compare */ bool asymmetric_key_id_partial(const struct asymmetric_key_id *kid1, const struct asymmetric_key_id *kid2) { if (!kid1 || !kid2) return false; if (kid1->len < kid2->len) return false; return memcmp(kid1->data + (kid1->len - kid2->len), kid2->data, kid2->len) == 0; } EXPORT_SYMBOL_GPL(asymmetric_key_id_partial); /** * asymmetric_match_key_ids - Search asymmetric key IDs 1 & 2 * @kids: The pair of key IDs to check * @match_id: The key ID we're looking for * @match: The match function to use */ static bool asymmetric_match_key_ids( const struct asymmetric_key_ids *kids, const struct asymmetric_key_id *match_id, bool (*match)(const struct asymmetric_key_id *kid1, const struct asymmetric_key_id *kid2)) { int i; if (!kids || !match_id) return false; for (i = 0; i < 2; i++) if (match(kids->id[i], match_id)) return true; return false; } /* helper function can be called directly with pre-allocated memory */ inline int __asymmetric_key_hex_to_key_id(const char *id, struct asymmetric_key_id *match_id, size_t hexlen) { match_id->len = hexlen; return hex2bin(match_id->data, id, hexlen); } /** * asymmetric_key_hex_to_key_id - Convert a hex string into a key ID. * @id: The ID as a hex string. */ struct asymmetric_key_id *asymmetric_key_hex_to_key_id(const char *id) { struct asymmetric_key_id *match_id; size_t asciihexlen; int ret; if (!*id) return ERR_PTR(-EINVAL); asciihexlen = strlen(id); if (asciihexlen & 1) return ERR_PTR(-EINVAL); match_id = kmalloc(sizeof(struct asymmetric_key_id) + asciihexlen / 2, GFP_KERNEL); if (!match_id) return ERR_PTR(-ENOMEM); ret = __asymmetric_key_hex_to_key_id(id, match_id, asciihexlen / 2); if (ret < 0) { kfree(match_id); return ERR_PTR(-EINVAL); } return match_id; } /* * Match asymmetric keys by an exact match on one of the first two IDs. */ static bool asymmetric_key_cmp(const struct key *key, const struct key_match_data *match_data) { const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); const struct asymmetric_key_id *match_id = match_data->preparsed; return asymmetric_match_key_ids(kids, match_id, asymmetric_key_id_same); } /* * Match asymmetric keys by a partial match on one of the first two IDs. */ static bool asymmetric_key_cmp_partial(const struct key *key, const struct key_match_data *match_data) { const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); const struct asymmetric_key_id *match_id = match_data->preparsed; return asymmetric_match_key_ids(kids, match_id, asymmetric_key_id_partial); } /* * Match asymmetric keys by an exact match on the third IDs. */ static bool asymmetric_key_cmp_name(const struct key *key, const struct key_match_data *match_data) { const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); const struct asymmetric_key_id *match_id = match_data->preparsed; return kids && asymmetric_key_id_same(kids->id[2], match_id); } /* * Preparse the match criterion. If we don't set lookup_type and cmp, * the default will be an exact match on the key description. 
* * There are some specifiers for matching key IDs rather than by the key * description: * * "id:<id>" - find a key by partial match on one of the first two IDs * "ex:<id>" - find a key by exact match on one of the first two IDs * "dn:<id>" - find a key by exact match on the third ID * * These have to be searched by iteration rather than by direct lookup because * the key is hashed according to its description. */ static int asymmetric_key_match_preparse(struct key_match_data *match_data) { struct asymmetric_key_id *match_id; const char *spec = match_data->raw_data; const char *id; bool (*cmp)(const struct key *, const struct key_match_data *) = asymmetric_key_cmp; if (!spec || !*spec) return -EINVAL; if (spec[0] == 'i' && spec[1] == 'd' && spec[2] == ':') { id = spec + 3; cmp = asymmetric_key_cmp_partial; } else if (spec[0] == 'e' && spec[1] == 'x' && spec[2] == ':') { id = spec + 3; } else if (spec[0] == 'd' && spec[1] == 'n' && spec[2] == ':') { id = spec + 3; cmp = asymmetric_key_cmp_name; } else { goto default_match; } match_id = asymmetric_key_hex_to_key_id(id); if (IS_ERR(match_id)) return PTR_ERR(match_id); match_data->preparsed = match_id; match_data->cmp = cmp; match_data->lookup_type = KEYRING_SEARCH_LOOKUP_ITERATE; return 0; default_match: return 0; } /* * Free the preparsed the match criterion. */ static void asymmetric_key_match_free(struct key_match_data *match_data) { kfree(match_data->preparsed); } /* * Describe the asymmetric key */ static void asymmetric_key_describe(const struct key *key, struct seq_file *m) { const struct asymmetric_key_subtype *subtype = asymmetric_key_subtype(key); const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); const struct asymmetric_key_id *kid; const unsigned char *p; int n; seq_puts(m, key->description); if (subtype) { seq_puts(m, ": "); subtype->describe(key, m); if (kids && kids->id[1]) { kid = kids->id[1]; seq_putc(m, ' '); n = kid->len; p = kid->data; if (n > 4) { p += n - 4; n = 4; } seq_printf(m, "%*phN", n, p); } seq_puts(m, " ["); /* put something here to indicate the key's capabilities */ seq_putc(m, ']'); } } /* * Preparse a asymmetric payload to get format the contents appropriately for the * internal payload to cut down on the number of scans of the data performed. * * We also generate a proposed description from the contents of the key that * can be used to name the key if the user doesn't want to provide one. 
*/ static int asymmetric_key_preparse(struct key_preparsed_payload *prep) { struct asymmetric_key_parser *parser; int ret; pr_devel("==>%s()\n", __func__); if (prep->datalen == 0) return -EINVAL; down_read(&asymmetric_key_parsers_sem); ret = -EBADMSG; list_for_each_entry(parser, &asymmetric_key_parsers, link) { pr_debug("Trying parser '%s'\n", parser->name); ret = parser->parse(prep); if (ret != -EBADMSG) { pr_debug("Parser recognised the format (ret %d)\n", ret); break; } } up_read(&asymmetric_key_parsers_sem); pr_devel("<==%s() = %d\n", __func__, ret); return ret; } /* * Clean up the key ID list */ static void asymmetric_key_free_kids(struct asymmetric_key_ids *kids) { int i; if (kids) { for (i = 0; i < ARRAY_SIZE(kids->id); i++) kfree(kids->id[i]); kfree(kids); } } /* * Clean up the preparse data */ static void asymmetric_key_free_preparse(struct key_preparsed_payload *prep) { struct asymmetric_key_subtype *subtype = prep->payload.data[asym_subtype]; struct asymmetric_key_ids *kids = prep->payload.data[asym_key_ids]; pr_devel("==>%s()\n", __func__); if (subtype) { subtype->destroy(prep->payload.data[asym_crypto], prep->payload.data[asym_auth]); module_put(subtype->owner); } asymmetric_key_free_kids(kids); kfree(prep->description); } /* * dispose of the data dangling from the corpse of a asymmetric key */ static void asymmetric_key_destroy(struct key *key) { struct asymmetric_key_subtype *subtype = asymmetric_key_subtype(key); struct asymmetric_key_ids *kids = key->payload.data[asym_key_ids]; void *data = key->payload.data[asym_crypto]; void *auth = key->payload.data[asym_auth]; key->payload.data[asym_crypto] = NULL; key->payload.data[asym_subtype] = NULL; key->payload.data[asym_key_ids] = NULL; key->payload.data[asym_auth] = NULL; if (subtype) { subtype->destroy(data, auth); module_put(subtype->owner); } asymmetric_key_free_kids(kids); } static struct key_restriction *asymmetric_restriction_alloc( key_restrict_link_func_t check, struct key *key) { struct key_restriction *keyres = kzalloc(sizeof(struct key_restriction), GFP_KERNEL); if (!keyres) return ERR_PTR(-ENOMEM); keyres->check = check; keyres->key = key; keyres->keytype = &key_type_asymmetric; return keyres; } /* * look up keyring restrict functions for asymmetric keys */ static struct key_restriction *asymmetric_lookup_restriction( const char *restriction) { char *restrict_method; char *parse_buf; char *next; struct key_restriction *ret = ERR_PTR(-EINVAL); if (strcmp("builtin_trusted", restriction) == 0) return asymmetric_restriction_alloc( restrict_link_by_builtin_trusted, NULL); if (strcmp("builtin_and_secondary_trusted", restriction) == 0) return asymmetric_restriction_alloc( restrict_link_by_builtin_and_secondary_trusted, NULL); parse_buf = kstrndup(restriction, PAGE_SIZE, GFP_KERNEL); if (!parse_buf) return ERR_PTR(-ENOMEM); next = parse_buf; restrict_method = strsep(&next, ":"); if ((strcmp(restrict_method, "key_or_keyring") == 0) && next) { char *key_text; key_serial_t serial; struct key *key; key_restrict_link_func_t link_fn = restrict_link_by_key_or_keyring; bool allow_null_key = false; key_text = strsep(&next, ":"); if (next) { if (strcmp(next, "chain") != 0) goto out; link_fn = restrict_link_by_key_or_keyring_chain; allow_null_key = true; } if (kstrtos32(key_text, 0, &serial) < 0) goto out; if ((serial == 0) && allow_null_key) { key = NULL; } else { key = key_lookup(serial); if (IS_ERR(key)) { ret = ERR_CAST(key); goto out; } } ret = asymmetric_restriction_alloc(link_fn, key); if (IS_ERR(ret)) key_put(key); } out: 
kfree(parse_buf); return ret; } int asymmetric_key_eds_op(struct kernel_pkey_params *params, const void *in, void *out) { const struct asymmetric_key_subtype *subtype; struct key *key = params->key; int ret; pr_devel("==>%s()\n", __func__); if (key->type != &key_type_asymmetric) return -EINVAL; subtype = asymmetric_key_subtype(key); if (!subtype || !key->payload.data[0]) return -EINVAL; if (!subtype->eds_op) return -ENOTSUPP; ret = subtype->eds_op(params, in, out); pr_devel("<==%s() = %d\n", __func__, ret); return ret; } static int asymmetric_key_verify_signature(struct kernel_pkey_params *params, const void *in, const void *in2) { struct public_key_signature sig = { .s_size = params->in2_len, .digest_size = params->in_len, .encoding = params->encoding, .hash_algo = params->hash_algo, .digest = (void *)in, .s = (void *)in2, }; return verify_signature(params->key, &sig); } struct key_type key_type_asymmetric = { .name = "asymmetric", .preparse = asymmetric_key_preparse, .free_preparse = asymmetric_key_free_preparse, .instantiate = generic_key_instantiate, .match_preparse = asymmetric_key_match_preparse, .match_free = asymmetric_key_match_free, .destroy = asymmetric_key_destroy, .describe = asymmetric_key_describe, .lookup_restriction = asymmetric_lookup_restriction, .asym_query = query_asymmetric_key, .asym_eds_op = asymmetric_key_eds_op, .asym_verify_signature = asymmetric_key_verify_signature, }; EXPORT_SYMBOL_GPL(key_type_asymmetric); /** * register_asymmetric_key_parser - Register a asymmetric key blob parser * @parser: The parser to register */ int register_asymmetric_key_parser(struct asymmetric_key_parser *parser) { struct asymmetric_key_parser *cursor; int ret; down_write(&asymmetric_key_parsers_sem); list_for_each_entry(cursor, &asymmetric_key_parsers, link) { if (strcmp(cursor->name, parser->name) == 0) { pr_err("Asymmetric key parser '%s' already registered\n", parser->name); ret = -EEXIST; goto out; } } list_add_tail(&parser->link, &asymmetric_key_parsers); pr_notice("Asymmetric key parser '%s' registered\n", parser->name); ret = 0; out: up_write(&asymmetric_key_parsers_sem); return ret; } EXPORT_SYMBOL_GPL(register_asymmetric_key_parser); /** * unregister_asymmetric_key_parser - Unregister a asymmetric key blob parser * @parser: The parser to unregister */ void unregister_asymmetric_key_parser(struct asymmetric_key_parser *parser) { down_write(&asymmetric_key_parsers_sem); list_del(&parser->link); up_write(&asymmetric_key_parsers_sem); pr_notice("Asymmetric key parser '%s' unregistered\n", parser->name); } EXPORT_SYMBOL_GPL(unregister_asymmetric_key_parser); /* * Module stuff */ static int __init asymmetric_key_init(void) { return register_key_type(&key_type_asymmetric); } static void __exit asymmetric_key_cleanup(void) { unregister_key_type(&key_type_asymmetric); } module_init(asymmetric_key_init); module_exit(asymmetric_key_cleanup); |
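/*
 * Illustrative, standalone sketch (userspace-style, not part of the keyring
 * code): find_asymmetric_key() above builds a description of the form
 * "ex:<hex>", "id:<hex>" or "dn:<hex>" and hands it to keyring_search().
 * The helper below is hypothetical and just mirrors that string construction
 * for a raw binary key ID.
 */
#include <stdio.h>
#include <stdlib.h>

static char *build_asym_lookup_spec(const char *prefix,	/* "id", "ex" or "dn" */
				    const unsigned char *id, size_t len)
{
	/* prefix (2 chars) + ':' + two hex digits per byte + NUL */
	char *spec = malloc(2 + 1 + len * 2 + 1);
	size_t i;

	if (!spec)
		return NULL;
	sprintf(spec, "%s:", prefix);
	for (i = 0; i < len; i++)
		sprintf(spec + 3 + i * 2, "%02x", id[i]);
	return spec;
}

/* build_asym_lookup_spec("ex", (const unsigned char *)"\x12\x34", 2) yields "ex:1234". */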
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Universal TUN/TAP device driver. * Copyright (C) 1999-2000 Maxim Krasnyansky <max_mk@yahoo.com> */ #ifndef __IF_TUN_H #define __IF_TUN_H #include <uapi/linux/if_tun.h> #include <uapi/linux/virtio_net.h> #define TUN_XDP_FLAG 0x1UL #define TUN_MSG_UBUF 1 #define TUN_MSG_PTR 2 struct tun_msg_ctl { unsigned short type; unsigned short num; void *ptr; }; #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) struct socket *tun_get_socket(struct file *); struct ptr_ring *tun_get_tx_ring(struct file *file); static inline bool tun_is_xdp_frame(void *ptr) { return (unsigned long)ptr & TUN_XDP_FLAG; } static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) { return (void *)((unsigned long)xdp | TUN_XDP_FLAG); } static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) { return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG); } void tun_ptr_free(void *ptr); #else #include <linux/err.h> #include <linux/errno.h> struct file; struct socket; static inline struct socket *tun_get_socket(struct file *f) { return ERR_PTR(-EINVAL); } static inline struct ptr_ring *tun_get_tx_ring(struct file *f) { return ERR_PTR(-EINVAL); } static inline bool tun_is_xdp_frame(void *ptr) { return false; } static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) { return NULL; } static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) { return NULL; } static inline void tun_ptr_free(void *ptr) { } #endif /* CONFIG_TUN */ #endif /* __IF_TUN_H */ |
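/*
 * Illustrative sketch, not part of the header above: the helpers tag an
 * xdp_frame pointer by setting its low bit (TUN_XDP_FLAG), which is safe
 * because the frames are at least word-aligned. A consumer draining the TX
 * ring (vhost-net style; the function name is hypothetical) branches on the
 * tag to tell XDP frames apart from ordinary socket buffers:
 */
static void example_drain_ring_entry(void *ptr)
{
	if (tun_is_xdp_frame(ptr)) {
		struct xdp_frame *frame = tun_ptr_to_xdp(ptr);

		/* ... transmit or free the XDP frame ... */
		(void)frame;
	} else {
		/*
		 * Untagged entries are ordinary struct sk_buff pointers;
		 * hand them to the normal skb path.
		 */
	}
}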
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 | /* * Copyright (C) 2008, VMware, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or * NON INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * */ #ifndef _ASM_X86_HYPERVISOR_H #define _ASM_X86_HYPERVISOR_H /* x86 hypervisor types */ enum x86_hypervisor_type { X86_HYPER_NATIVE = 0, X86_HYPER_VMWARE, X86_HYPER_MS_HYPERV, X86_HYPER_XEN_PV, X86_HYPER_XEN_HVM, X86_HYPER_KVM, X86_HYPER_JAILHOUSE, X86_HYPER_ACRN, X86_HYPER_BHYVE, }; #ifdef CONFIG_HYPERVISOR_GUEST #include <asm/kvm_para.h> #include <asm/x86_init.h> #include <asm/xen/hypervisor.h> struct hypervisor_x86 { /* Hypervisor name */ const char *name; /* Detection routine */ uint32_t (*detect)(void); /* Hypervisor type */ enum x86_hypervisor_type type; /* init time callbacks */ struct x86_hyper_init init; /* runtime callbacks */ struct x86_hyper_runtime runtime; /* ignore nopv parameter */ bool ignore_nopv; }; extern const struct hypervisor_x86 x86_hyper_vmware; extern const struct hypervisor_x86 x86_hyper_ms_hyperv; extern const struct hypervisor_x86 x86_hyper_xen_pv; extern const struct hypervisor_x86 x86_hyper_kvm; extern const struct hypervisor_x86 x86_hyper_jailhouse; extern const struct hypervisor_x86 x86_hyper_acrn; extern const struct hypervisor_x86 x86_hyper_bhyve; extern struct hypervisor_x86 x86_hyper_xen_hvm; extern bool nopv; extern enum x86_hypervisor_type x86_hyper_type; extern void init_hypervisor_platform(void); static inline bool hypervisor_is_type(enum x86_hypervisor_type type) { return x86_hyper_type == type; } #else static inline void init_hypervisor_platform(void) { } static inline bool hypervisor_is_type(enum x86_hypervisor_type type) { return type == X86_HYPER_NATIVE; } #endif /* CONFIG_HYPERVISOR_GUEST */ #endif /* _ASM_X86_HYPERVISOR_H */ |
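/*
 * Illustrative sketch, not part of the header above: init_hypervisor_platform()
 * runs the registered ->detect() callbacks and records the selected hypervisor
 * in x86_hyper_type, after which callers only need hypervisor_is_type().
 * The function name below is hypothetical.
 */
static bool example_running_on_kvm(void)
{
	/*
	 * With CONFIG_HYPERVISOR_GUEST disabled this reduces to a constant
	 * comparison against X86_HYPER_NATIVE, so callers need no #ifdef.
	 */
	return hypervisor_is_type(X86_HYPER_KVM);
}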
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 * Copyright (C) 2016 - 2020 Christoph Hellwig
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include
<linux/uio.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/part_stat.h> #include <linux/uaccess.h> #include <linux/stat.h> #include "../fs/internal.h" #include "blk.h" /* Should we allow writing to mounted block devices? */ static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED); struct bdev_inode { struct block_device bdev; struct inode vfs_inode; }; static inline struct bdev_inode *BDEV_I(struct inode *inode) { return container_of(inode, struct bdev_inode, vfs_inode); } static inline struct inode *BD_INODE(struct block_device *bdev) { return &container_of(bdev, struct bdev_inode, bdev)->vfs_inode; } struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } EXPORT_SYMBOL(I_BDEV); struct block_device *file_bdev(struct file *bdev_file) { return I_BDEV(bdev_file->f_mapping->host); } EXPORT_SYMBOL(file_bdev); static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = BD_INODE(bdev); int ret; spin_lock(&inode->i_lock); while (inode_state_read(inode) & I_DIRTY) { spin_unlock(&inode->i_lock); ret = write_inode_now(inode, true); if (ret) pr_warn_ratelimited( "VFS: Dirty inode writeback failed for block device %pg (err=%d).\n", bdev, ret); spin_lock(&inode->i_lock); } spin_unlock(&inode->i_lock); } /* Kill _all_ buffers and pagecache , dirty or not.. */ static void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_mapping; if (mapping_empty(mapping)) return; invalidate_bh_lrus(); truncate_inode_pages(mapping, 0); } /* Invalidate clean unused buffers and pagecache. */ void invalidate_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_mapping; if (mapping->nrpages) { invalidate_bh_lrus(); lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); } } EXPORT_SYMBOL(invalidate_bdev); /* * Drop all buffers & page cache for given bdev range. This function bails * with error if bdev has other exclusive owner (such as filesystem). */ int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, loff_t lstart, loff_t lend) { /* * If we don't hold exclusive handle for the device, upgrade to it * while we discard the buffer cache to avoid discarding buffers * under live filesystem. */ if (!(mode & BLK_OPEN_EXCL)) { int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL); if (err) goto invalidate; } truncate_inode_pages_range(bdev->bd_mapping, lstart, lend); if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, truncate_bdev_range); return 0; invalidate: /* * Someone else has handle exclusively open. Try invalidating instead. * The 'end' argument is inclusive so the rounding is safe. */ return invalidate_inode_pages2_range(bdev->bd_mapping, lstart >> PAGE_SHIFT, lend >> PAGE_SHIFT); } static void set_init_blocksize(struct block_device *bdev) { unsigned int bsize = bdev_logical_block_size(bdev); loff_t size = i_size_read(BD_INODE(bdev)); while (bsize < PAGE_SIZE) { if (size & bsize) break; bsize <<= 1; } BD_INODE(bdev)->i_blkbits = blksize_bits(bsize); mapping_set_folio_min_order(BD_INODE(bdev)->i_mapping, get_order(bsize)); } /** * bdev_validate_blocksize - check that this block size is acceptable * @bdev: blockdevice to check * @block_size: block size to check * * For block device users that do not use buffer heads or the block device * page cache, make sure that this block size can be used with the device. * * Return: On success zero is returned, negative error code on failure. 
*/ int bdev_validate_blocksize(struct block_device *bdev, int block_size) { if (blk_validate_block_size(block_size)) return -EINVAL; /* Size cannot be smaller than the size supported by the device */ if (block_size < bdev_logical_block_size(bdev)) return -EINVAL; return 0; } EXPORT_SYMBOL_GPL(bdev_validate_blocksize); int set_blocksize(struct file *file, int size) { struct inode *inode = file->f_mapping->host; struct block_device *bdev = I_BDEV(inode); int ret; ret = bdev_validate_blocksize(bdev, size); if (ret) return ret; if (!file->private_data) return -EINVAL; /* Don't change the size if it is same as current */ if (inode->i_blkbits != blksize_bits(size)) { /* * Flush and truncate the pagecache before we reconfigure the * mapping geometry because folio sizes are variable now. If a * reader has already allocated a folio whose size is smaller * than the new min_order but invokes readahead after the new * min_order becomes visible, readahead will think there are * "zero" blocks per folio and crash. Take the inode and * invalidation locks to avoid racing with * read/write/fallocate. */ inode_lock(inode); filemap_invalidate_lock(inode->i_mapping); sync_blockdev(bdev); kill_bdev(bdev); inode->i_blkbits = blksize_bits(size); mapping_set_folio_min_order(inode->i_mapping, get_order(size)); kill_bdev(bdev); filemap_invalidate_unlock(inode->i_mapping); inode_unlock(inode); } return 0; } EXPORT_SYMBOL(set_blocksize); static int sb_validate_large_blocksize(struct super_block *sb, int size) { const char *err_str = NULL; if (!(sb->s_type->fs_flags & FS_LBS)) err_str = "not supported by filesystem"; else if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) err_str = "is only supported with CONFIG_TRANSPARENT_HUGEPAGE"; if (!err_str) return 0; pr_warn_ratelimited("%s: block size(%d) > page size(%lu) %s\n", sb->s_type->name, size, PAGE_SIZE, err_str); return -EINVAL; } int sb_set_blocksize(struct super_block *sb, int size) { if (size > PAGE_SIZE && sb_validate_large_blocksize(sb, size)) return 0; if (set_blocksize(sb->s_bdev_file, size)) return 0; /* If we get here, we know size is validated */ sb->s_blocksize = size; sb->s_blocksize_bits = blksize_bits(size); return sb->s_blocksize; } EXPORT_SYMBOL(sb_set_blocksize); int __must_check sb_min_blocksize(struct super_block *sb, int size) { int minsize = bdev_logical_block_size(sb->s_bdev); if (size < minsize) size = minsize; return sb_set_blocksize(sb, size); } EXPORT_SYMBOL(sb_min_blocksize); int sync_blockdev_nowait(struct block_device *bdev) { if (!bdev) return 0; return filemap_flush(bdev->bd_mapping); } EXPORT_SYMBOL_GPL(sync_blockdev_nowait); /* * Write out and wait upon all the dirty data associated with a block * device via its mapping. Does not take the superblock lock. */ int sync_blockdev(struct block_device *bdev) { if (!bdev) return 0; return filemap_write_and_wait(bdev->bd_mapping); } EXPORT_SYMBOL(sync_blockdev); int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend) { return filemap_write_and_wait_range(bdev->bd_mapping, lstart, lend); } EXPORT_SYMBOL(sync_blockdev_range); /** * bdev_freeze - lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock * * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. * The reference counter (bd_fsfreeze_count) guarantees that only the last * unfreeze process can unfreeze the frozen filesystem actually when multiple * freeze requests arrive simultaneously. 
It counts up in bdev_freeze() and * count down in bdev_thaw(). When it becomes 0, thaw_bdev() will unfreeze * actually. * * Return: On success zero is returned, negative error code on failure. */ int bdev_freeze(struct block_device *bdev) { int error = 0; mutex_lock(&bdev->bd_fsfreeze_mutex); if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) { mutex_unlock(&bdev->bd_fsfreeze_mutex); return 0; } mutex_lock(&bdev->bd_holder_lock); if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) { error = bdev->bd_holder_ops->freeze(bdev); lockdep_assert_not_held(&bdev->bd_holder_lock); } else { mutex_unlock(&bdev->bd_holder_lock); error = sync_blockdev(bdev); } if (error) atomic_dec(&bdev->bd_fsfreeze_count); mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } EXPORT_SYMBOL(bdev_freeze); /** * bdev_thaw - unlock filesystem * @bdev: blockdevice to unlock * * Unlocks the filesystem and marks it writeable again after bdev_freeze(). * * Return: On success zero is returned, negative error code on failure. */ int bdev_thaw(struct block_device *bdev) { int error = -EINVAL, nr_freeze; mutex_lock(&bdev->bd_fsfreeze_mutex); /* * If this returns < 0 it means that @bd_fsfreeze_count was * already 0 and no decrement was performed. */ nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count); if (nr_freeze < 0) goto out; error = 0; if (nr_freeze > 0) goto out; mutex_lock(&bdev->bd_holder_lock); if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) { error = bdev->bd_holder_ops->thaw(bdev); lockdep_assert_not_held(&bdev->bd_holder_lock); } else { mutex_unlock(&bdev->bd_holder_lock); } if (error) atomic_inc(&bdev->bd_fsfreeze_count); out: mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } EXPORT_SYMBOL(bdev_thaw); /* * pseudo-fs */ static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock); static struct kmem_cache *bdev_cachep __ro_after_init; static struct inode *bdev_alloc_inode(struct super_block *sb) { struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL); if (!ei) return NULL; memset(&ei->bdev, 0, sizeof(ei->bdev)); if (security_bdev_alloc(&ei->bdev)) { kmem_cache_free(bdev_cachep, ei); return NULL; } return &ei->vfs_inode; } static void bdev_free_inode(struct inode *inode) { struct block_device *bdev = I_BDEV(inode); free_percpu(bdev->bd_stats); kfree(bdev->bd_meta_info); security_bdev_free(bdev); if (!bdev_is_partition(bdev)) { if (bdev->bd_disk && bdev->bd_disk->bdi) bdi_put(bdev->bd_disk->bdi); kfree(bdev->bd_disk); } if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) blk_free_ext_minor(MINOR(bdev->bd_dev)); kmem_cache_free(bdev_cachep, BDEV_I(inode)); } static void init_once(void *data) { struct bdev_inode *ei = data; inode_init_once(&ei->vfs_inode); } static void bdev_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? 
*/ clear_inode(inode); } static const struct super_operations bdev_sops = { .statfs = simple_statfs, .alloc_inode = bdev_alloc_inode, .free_inode = bdev_free_inode, .drop_inode = inode_just_drop, .evict_inode = bdev_evict_inode, }; static int bd_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC); if (!ctx) return -ENOMEM; fc->s_iflags |= SB_I_CGROUPWB; ctx->ops = &bdev_sops; return 0; } static struct file_system_type bd_type = { .name = "bdev", .init_fs_context = bd_init_fs_context, .kill_sb = kill_anon_super, }; struct super_block *blockdev_superblock __ro_after_init; static struct vfsmount *blockdev_mnt __ro_after_init; EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) { int err; bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT|SLAB_PANIC), init_once); err = register_filesystem(&bd_type); if (err) panic("Cannot register bdev pseudo-fs"); blockdev_mnt = kern_mount(&bd_type); if (IS_ERR(blockdev_mnt)) panic("Cannot create bdev pseudo-fs"); blockdev_superblock = blockdev_mnt->mnt_sb; /* For writeback */ } struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) { struct block_device *bdev; struct inode *inode; inode = new_inode(blockdev_superblock); if (!inode) return NULL; inode->i_mode = S_IFBLK; inode->i_rdev = 0; inode->i_data.a_ops = &def_blk_aops; mapping_set_gfp_mask(&inode->i_data, GFP_USER); bdev = I_BDEV(inode); mutex_init(&bdev->bd_fsfreeze_mutex); spin_lock_init(&bdev->bd_size_lock); mutex_init(&bdev->bd_holder_lock); atomic_set(&bdev->__bd_flags, partno); bdev->bd_mapping = &inode->i_data; bdev->bd_queue = disk->queue; if (partno && bdev_test_flag(disk->part0, BD_HAS_SUBMIT_BIO)) bdev_set_flag(bdev, BD_HAS_SUBMIT_BIO); bdev->bd_stats = alloc_percpu(struct disk_stats); if (!bdev->bd_stats) { iput(inode); return NULL; } bdev->bd_disk = disk; return bdev; } void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) { spin_lock(&bdev->bd_size_lock); i_size_write(BD_INODE(bdev), (loff_t)sectors << SECTOR_SHIFT); bdev->bd_nr_sectors = sectors; spin_unlock(&bdev->bd_size_lock); } void bdev_add(struct block_device *bdev, dev_t dev) { struct inode *inode = BD_INODE(bdev); if (bdev_stable_writes(bdev)) mapping_set_stable_writes(bdev->bd_mapping); bdev->bd_dev = dev; inode->i_rdev = dev; inode->i_ino = dev; insert_inode_hash(inode); } void bdev_unhash(struct block_device *bdev) { remove_inode_hash(BD_INODE(bdev)); } void bdev_drop(struct block_device *bdev) { iput(BD_INODE(bdev)); } long nr_blockdev_pages(void) { struct inode *inode; long ret = 0; spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) ret += inode->i_mapping->nrpages; spin_unlock(&blockdev_superblock->s_inode_list_lock); return ret; } /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest * @holder: holder trying to claim @bdev * @hops: holder ops * * Test whether @bdev can be claimed by @holder. * * RETURNS: * %true if @bdev can be claimed, %false otherwise. */ static bool bd_may_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops) { struct block_device *whole = bdev_whole(bdev); lockdep_assert_held(&bdev_lock); if (bdev->bd_holder) { /* * The same holder can always re-claim. 
*/ if (bdev->bd_holder == holder) { if (WARN_ON_ONCE(bdev->bd_holder_ops != hops)) return false; return true; } return false; } /* * If the whole devices holder is set to bd_may_claim, a partition on * the device is claimed, but not the whole device. */ if (whole != bdev && whole->bd_holder && whole->bd_holder != bd_may_claim) return false; return true; } /** * bd_prepare_to_claim - claim a block device * @bdev: block device of interest * @holder: holder trying to claim @bdev * @hops: holder ops. * * Claim @bdev. This function fails if @bdev is already claimed by another * holder and waits if another claiming is in progress. return, the caller * has ownership of bd_claiming and bd_holder[s]. * * RETURNS: * 0 if @bdev can be claimed, -EBUSY otherwise. */ int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops) { struct block_device *whole = bdev_whole(bdev); if (WARN_ON_ONCE(!holder)) return -EINVAL; retry: mutex_lock(&bdev_lock); /* if someone else claimed, fail */ if (!bd_may_claim(bdev, holder, hops)) { mutex_unlock(&bdev_lock); return -EBUSY; } /* if claiming is already in progress, wait for it to finish */ if (whole->bd_claiming) { wait_queue_head_t *wq = __var_waitqueue(&whole->bd_claiming); DEFINE_WAIT(wait); prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&bdev_lock); schedule(); finish_wait(wq, &wait); goto retry; } /* yay, all mine */ whole->bd_claiming = holder; mutex_unlock(&bdev_lock); return 0; } EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ static void bd_clear_claiming(struct block_device *whole, void *holder) { lockdep_assert_held(&bdev_lock); /* tell others that we're done */ BUG_ON(whole->bd_claiming != holder); whole->bd_claiming = NULL; wake_up_var(&whole->bd_claiming); } /** * bd_finish_claiming - finish claiming of a block device * @bdev: block device of interest * @holder: holder that has claimed @bdev * @hops: block device holder operations * * Finish exclusive open of a block device. Mark the device as exlusively * open by the holder and wake up all waiters for exclusive open to finish. */ static void bd_finish_claiming(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops) { struct block_device *whole = bdev_whole(bdev); mutex_lock(&bdev_lock); BUG_ON(!bd_may_claim(bdev, holder, hops)); /* * Note that for a whole device bd_holders will be incremented twice, * and bd_holder will be set to bd_may_claim before being set to holder */ whole->bd_holders++; whole->bd_holder = bd_may_claim; bdev->bd_holders++; mutex_lock(&bdev->bd_holder_lock); bdev->bd_holder = holder; bdev->bd_holder_ops = hops; mutex_unlock(&bdev->bd_holder_lock); bd_clear_claiming(whole, holder); mutex_unlock(&bdev_lock); } /** * bd_abort_claiming - abort claiming of a block device * @bdev: block device of interest * @holder: holder that has claimed @bdev * * Abort claiming of a block device when the exclusive open failed. This can be * also used when exclusive open is not actually desired and we just needed * to block other exclusive openers for a while. */ void bd_abort_claiming(struct block_device *bdev, void *holder) { mutex_lock(&bdev_lock); bd_clear_claiming(bdev_whole(bdev), holder); mutex_unlock(&bdev_lock); } EXPORT_SYMBOL(bd_abort_claiming); static void bd_end_claim(struct block_device *bdev, void *holder) { struct block_device *whole = bdev_whole(bdev); bool unblock = false; /* * Release a claim on the device. The holder fields are protected with * bdev_lock. 
open_mutex is used to synchronize disk_holder unlinking. */ mutex_lock(&bdev_lock); WARN_ON_ONCE(bdev->bd_holder != holder); WARN_ON_ONCE(--bdev->bd_holders < 0); WARN_ON_ONCE(--whole->bd_holders < 0); if (!bdev->bd_holders) { mutex_lock(&bdev->bd_holder_lock); bdev->bd_holder = NULL; bdev->bd_holder_ops = NULL; mutex_unlock(&bdev->bd_holder_lock); if (bdev_test_flag(bdev, BD_WRITE_HOLDER)) unblock = true; } if (!whole->bd_holders) whole->bd_holder = NULL; mutex_unlock(&bdev_lock); /* * If this was the last claim, remove holder link and unblock evpoll if * it was a write holder. */ if (unblock) { disk_unblock_events(bdev->bd_disk); bdev_clear_flag(bdev, BD_WRITE_HOLDER); } } static void blkdev_flush_mapping(struct block_device *bdev) { WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); bdev_write_inode(bdev); } static void blkdev_put_whole(struct block_device *bdev) { if (atomic_dec_and_test(&bdev->bd_openers)) blkdev_flush_mapping(bdev); if (bdev->bd_disk->fops->release) bdev->bd_disk->fops->release(bdev->bd_disk); } static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode) { struct gendisk *disk = bdev->bd_disk; int ret; if (disk->fops->open) { ret = disk->fops->open(disk, mode); if (ret) { /* avoid ghost partitions on a removed medium */ if (ret == -ENOMEDIUM && test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, true); return ret; } } if (!atomic_read(&bdev->bd_openers)) set_init_blocksize(bdev); atomic_inc(&bdev->bd_openers); if (test_bit(GD_NEED_PART_SCAN, &disk->state)) { /* * Only return scanning errors if we are called from contexts * that explicitly want them, e.g. the BLKRRPART ioctl. */ ret = bdev_disk_changed(disk, false); if (ret && (mode & BLK_OPEN_STRICT_SCAN)) { blkdev_put_whole(bdev); return ret; } } return 0; } static int blkdev_get_part(struct block_device *part, blk_mode_t mode) { struct gendisk *disk = part->bd_disk; int ret; ret = blkdev_get_whole(bdev_whole(part), mode); if (ret) return ret; ret = -ENXIO; if (!bdev_nr_sectors(part)) goto out_blkdev_put; if (!atomic_read(&part->bd_openers)) { disk->open_partitions++; set_init_blocksize(part); } atomic_inc(&part->bd_openers); return 0; out_blkdev_put: blkdev_put_whole(bdev_whole(part)); return ret; } int bdev_permission(dev_t dev, blk_mode_t mode, void *holder) { int ret; ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, MAJOR(dev), MINOR(dev), ((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) | ((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0)); if (ret) return ret; /* Blocking writes requires exclusive opener */ if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) return -EINVAL; /* * We're using error pointers to indicate to ->release() when we * failed to open that block device. Also this doesn't make sense. 
*/ if (WARN_ON_ONCE(IS_ERR(holder))) return -EINVAL; return 0; } static void blkdev_put_part(struct block_device *part) { struct block_device *whole = bdev_whole(part); if (atomic_dec_and_test(&part->bd_openers)) { blkdev_flush_mapping(part); whole->bd_disk->open_partitions--; } blkdev_put_whole(whole); } struct block_device *blkdev_get_no_open(dev_t dev, bool autoload) { struct block_device *bdev; struct inode *inode; inode = ilookup(blockdev_superblock, dev); if (!inode && autoload && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) { blk_request_module(dev); inode = ilookup(blockdev_superblock, dev); if (inode) pr_warn_ratelimited( "block device autoloading is deprecated and will be removed.\n"); } if (!inode) return NULL; /* switch from the inode reference to a device mode one: */ bdev = &BDEV_I(inode)->bdev; if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) bdev = NULL; iput(inode); return bdev; } void blkdev_put_no_open(struct block_device *bdev) { put_device(&bdev->bd_device); } static bool bdev_writes_blocked(struct block_device *bdev) { return bdev->bd_writers < 0; } static void bdev_block_writes(struct block_device *bdev) { bdev->bd_writers--; } static void bdev_unblock_writes(struct block_device *bdev) { bdev->bd_writers++; } static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode) { if (bdev_allow_write_mounted) return true; /* Writes blocked? */ if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev)) return false; if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0) return false; return true; } static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode) { if (bdev_allow_write_mounted) return; /* Claim exclusive or shared write access. */ if (mode & BLK_OPEN_RESTRICT_WRITES) bdev_block_writes(bdev); else if (mode & BLK_OPEN_WRITE) bdev->bd_writers++; } static inline bool bdev_unclaimed(const struct file *bdev_file) { return bdev_file->private_data == BDEV_I(bdev_file->f_mapping->host); } static void bdev_yield_write_access(struct file *bdev_file) { struct block_device *bdev; if (bdev_allow_write_mounted) return; if (bdev_unclaimed(bdev_file)) return; bdev = file_bdev(bdev_file); if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED) bdev_unblock_writes(bdev); else if (bdev_file->f_mode & FMODE_WRITE) bdev->bd_writers--; } /** * bdev_open - open a block device * @bdev: block device to open * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier * @hops: holder operations * @bdev_file: file for the block device * * Open the block device. If @holder is not %NULL, the block device is opened * with exclusive access. Exclusive opens may nest for the same @holder. * * CONTEXT: * Might sleep. * * RETURNS: * zero on success, -errno on failure. 
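 *
 * Example (hypothetical caller; @my_holder, @my_hops and @bdev_file stand
 * for objects the caller already owns):
 *
 *	ret = bdev_open(bdev, BLK_OPEN_READ | BLK_OPEN_WRITE, my_holder,
 *			my_hops, bdev_file);
 *	if (ret)
 *		return ret;
 *
 * Callers that only have a dev_t or a path normally go through
 * bdev_file_open_by_dev() or bdev_file_open_by_path() below instead.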
*/ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file) { bool unblock_events = true; struct gendisk *disk = bdev->bd_disk; int ret; if (holder) { mode |= BLK_OPEN_EXCL; ret = bd_prepare_to_claim(bdev, holder, hops); if (ret) return ret; } else { if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) return -EIO; } disk_block_events(disk); mutex_lock(&disk->open_mutex); ret = -ENXIO; if (!disk_live(disk)) goto abort_claiming; if (!try_module_get(disk->fops->owner)) goto abort_claiming; ret = -EBUSY; if (!bdev_may_open(bdev, mode)) goto put_module; if (bdev_is_partition(bdev)) ret = blkdev_get_part(bdev, mode); else ret = blkdev_get_whole(bdev, mode); if (ret) goto put_module; bdev_claim_write_access(bdev, mode); if (holder) { bd_finish_claiming(bdev, holder, hops); /* * Block event polling for write claims if requested. Any write * holder makes the write_holder state stick until all are * released. This is good enough and tracking individual * writeable reference is too fragile given the way @mode is * used in blkdev_get/put(). */ if ((mode & BLK_OPEN_WRITE) && !bdev_test_flag(bdev, BD_WRITE_HOLDER) && (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) { bdev_set_flag(bdev, BD_WRITE_HOLDER); unblock_events = false; } } mutex_unlock(&disk->open_mutex); if (unblock_events) disk_unblock_events(disk); bdev_file->f_flags |= O_LARGEFILE; bdev_file->f_mode |= FMODE_CAN_ODIRECT; if (bdev_nowait(bdev)) bdev_file->f_mode |= FMODE_NOWAIT; if (mode & BLK_OPEN_RESTRICT_WRITES) bdev_file->f_mode |= FMODE_WRITE_RESTRICTED; bdev_file->f_mapping = bdev->bd_mapping; bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping); bdev_file->private_data = holder; return 0; put_module: module_put(disk->fops->owner); abort_claiming: if (holder) bd_abort_claiming(bdev, holder); mutex_unlock(&disk->open_mutex); disk_unblock_events(disk); return ret; } /* * If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk * associated with the floppy driver where it has allowed ioctls if the * file was opened for writing, but does not allow reads or writes. * Make sure that this quirk is reflected in @f_flags. * * It can also happen if a block device is opened as O_RDWR | O_WRONLY. */ static unsigned blk_to_file_flags(blk_mode_t mode) { unsigned int flags = 0; if ((mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) == (BLK_OPEN_READ | BLK_OPEN_WRITE)) flags |= O_RDWR; else if (mode & BLK_OPEN_WRITE_IOCTL) flags |= O_RDWR | O_WRONLY; else if (mode & BLK_OPEN_WRITE) flags |= O_WRONLY; else if (mode & BLK_OPEN_READ) flags |= O_RDONLY; /* homeopathic, because O_RDONLY is 0 */ else WARN_ON_ONCE(true); if (mode & BLK_OPEN_NDELAY) flags |= O_NDELAY; return flags; } struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops) { struct file *bdev_file; struct block_device *bdev; unsigned int flags; int ret; ret = bdev_permission(dev, mode, holder); if (ret) return ERR_PTR(ret); bdev = blkdev_get_no_open(dev, true); if (!bdev) return ERR_PTR(-ENXIO); flags = blk_to_file_flags(mode); bdev_file = alloc_file_pseudo_noaccount(BD_INODE(bdev), blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops); if (IS_ERR(bdev_file)) { blkdev_put_no_open(bdev); return bdev_file; } ihold(BD_INODE(bdev)); ret = bdev_open(bdev, mode, holder, hops, bdev_file); if (ret) { /* We failed to open the block device. Let ->release() know. 
*/ bdev_file->private_data = ERR_PTR(ret); fput(bdev_file); return ERR_PTR(ret); } return bdev_file; } EXPORT_SYMBOL(bdev_file_open_by_dev); struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops) { struct file *file; dev_t dev; int error; error = lookup_bdev(path, &dev); if (error) return ERR_PTR(error); file = bdev_file_open_by_dev(dev, mode, holder, hops); if (!IS_ERR(file) && (mode & BLK_OPEN_WRITE)) { if (bdev_read_only(file_bdev(file))) { fput(file); file = ERR_PTR(-EACCES); } } return file; } EXPORT_SYMBOL(bdev_file_open_by_path); static inline void bd_yield_claim(struct file *bdev_file) { struct block_device *bdev = file_bdev(bdev_file); void *holder = bdev_file->private_data; lockdep_assert_held(&bdev->bd_disk->open_mutex); if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder))) return; if (!bdev_unclaimed(bdev_file)) bd_end_claim(bdev, holder); } void bdev_release(struct file *bdev_file) { struct block_device *bdev = file_bdev(bdev_file); void *holder = bdev_file->private_data; struct gendisk *disk = bdev->bd_disk; /* We failed to open that block device. */ if (IS_ERR(holder)) goto put_no_open; /* * Sync early if it looks like we're the last one. If someone else * opens the block device between now and the decrement of bd_openers * then we did a sync that we didn't need to, but that's not the end * of the world and we want to avoid long (could be several minute) * syncs while holding the mutex. */ if (atomic_read(&bdev->bd_openers) == 1) sync_blockdev(bdev); mutex_lock(&disk->open_mutex); bdev_yield_write_access(bdev_file); if (holder) bd_yield_claim(bdev_file); /* * Trigger event checking and tell drivers to flush MEDIA_CHANGE * event. This is to ensure detection of media removal commanded * from userland - e.g. eject(1). */ disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); if (bdev_is_partition(bdev)) blkdev_put_part(bdev); else blkdev_put_whole(bdev); mutex_unlock(&disk->open_mutex); module_put(disk->fops->owner); put_no_open: blkdev_put_no_open(bdev); } /** * bdev_fput - yield claim to the block device and put the file * @bdev_file: open block device * * Yield claim on the block device and put the file. Ensure that the * block device can be reclaimed before the file is closed which is a * deferred operation. */ void bdev_fput(struct file *bdev_file) { if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops)) return; if (bdev_file->private_data) { struct block_device *bdev = file_bdev(bdev_file); struct gendisk *disk = bdev->bd_disk; mutex_lock(&disk->open_mutex); bdev_yield_write_access(bdev_file); bd_yield_claim(bdev_file); /* * Tell release we already gave up our hold on the * device and if write restrictions are available that * we already gave up write access to the device. */ bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host); mutex_unlock(&disk->open_mutex); } fput(bdev_file); } EXPORT_SYMBOL(bdev_fput); /** * lookup_bdev() - Look up a struct block_device by name. * @pathname: Name of the block device in the filesystem. * @dev: Pointer to the block device's dev_t, if found. * * Lookup the block device's dev_t at @pathname in the current * namespace if possible and return it in @dev. * * Context: May sleep. * Return: 0 if succeeded, negative errno otherwise. 
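 *
 * Example (hypothetical caller resolving a device path taken from a mount
 * option; the path is only illustrative):
 *
 *	dev_t dev;
 *	int error = lookup_bdev("/dev/vda1", &dev);
 *
 *	if (error)
 *		return error;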
 */
int lookup_bdev(const char *pathname, dev_t *dev)
{
	struct inode *inode;
	struct path path;
	int error;

	if (!pathname || !*pathname)
		return -EINVAL;

	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
	if (error)
		return error;

	inode = d_backing_inode(path.dentry);
	error = -ENOTBLK;
	if (!S_ISBLK(inode->i_mode))
		goto out_path_put;
	error = -EACCES;
	if (!may_open_dev(&path))
		goto out_path_put;

	*dev = inode->i_rdev;
	error = 0;
out_path_put:
	path_put(&path);
	return error;
}
EXPORT_SYMBOL(lookup_bdev);

/**
 * bdev_mark_dead - mark a block device as dead
 * @bdev: block device to operate on
 * @surprise: indicate a surprise removal
 *
 * Tell the file system that this device or media is dead. If @surprise is set
 * to %true the device or media is already gone, if not we are preparing for an
 * orderly removal.
 *
 * This calls into the file system, which then typically syncs out all dirty
 * data and writes back inodes and then invalidates any cached data in the
 * inodes on the file system. In addition we also invalidate the block device
 * mapping.
 */
void bdev_mark_dead(struct block_device *bdev, bool surprise)
{
	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
		bdev->bd_holder_ops->mark_dead(bdev, surprise);
	else {
		mutex_unlock(&bdev->bd_holder_lock);
		sync_blockdev(bdev);
	}

	invalidate_bdev(bdev);
}
/*
 * New drivers should not use this directly. There are some drivers however
 * that need this for historical reasons. For example, the DASD driver has
 * historically had a shutdown to offline mode that doesn't actually remove the
 * gendisk that otherwise looks a lot like a safe device removal.
 */
EXPORT_SYMBOL_GPL(bdev_mark_dead);

void sync_bdevs(bool wait)
{
	struct inode *inode, *old_inode = NULL;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;
		struct block_device *bdev;

		spin_lock(&inode->i_lock);
		if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW) ||
		    mapping->nrpages == 0) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&blockdev_superblock->s_inode_list_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from s_inodes list while we dropped the
		 * s_inode_list_lock. We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it under
		 * s_inode_list_lock. So we keep the reference and iput it
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;
		bdev = I_BDEV(inode);

		mutex_lock(&bdev->bd_disk->open_mutex);
		if (!atomic_read(&bdev->bd_openers)) {
			; /* skip */
		} else if (wait) {
			/*
			 * We keep the error status of individual mapping so
			 * that applications can catch the writeback error using
			 * fsync(2). See filemap_fdatawait_keep_errors() for
			 * details.
			 */
			filemap_fdatawait_keep_errors(inode->i_mapping);
		} else {
			filemap_fdatawrite(inode->i_mapping);
		}
		mutex_unlock(&bdev->bd_disk->open_mutex);

		spin_lock(&blockdev_superblock->s_inode_list_lock);
	}
	spin_unlock(&blockdev_superblock->s_inode_list_lock);
	iput(old_inode);
}

/*
 * Handle STATX_{DIOALIGN, WRITE_ATOMIC} for block devices.
 */
void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask)
{
	struct block_device *bdev;

	/*
	 * Note that d_backing_inode() returns the block device node inode, not
	 * the block device's internal inode. Therefore it is *not* valid to
	 * use I_BDEV() here; the block device has to be looked up by i_rdev
	 * instead.
*/ bdev = blkdev_get_no_open(d_backing_inode(path->dentry)->i_rdev, false); if (!bdev) return; if (request_mask & STATX_DIOALIGN) { stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; stat->dio_offset_align = bdev_logical_block_size(bdev); stat->result_mask |= STATX_DIOALIGN; } if (request_mask & STATX_WRITE_ATOMIC && bdev_can_atomic_write(bdev)) { struct request_queue *bd_queue = bdev->bd_queue; generic_fill_statx_atomic_writes(stat, queue_atomic_write_unit_min_bytes(bd_queue), queue_atomic_write_unit_max_bytes(bd_queue), 0); } stat->blksize = bdev_io_min(bdev); blkdev_put_no_open(bdev); } bool disk_live(struct gendisk *disk) { return !inode_unhashed(BD_INODE(disk->part0)); } EXPORT_SYMBOL_GPL(disk_live); unsigned int block_size(struct block_device *bdev) { return 1 << BD_INODE(bdev)->i_blkbits; } EXPORT_SYMBOL_GPL(block_size); static int __init setup_bdev_allow_write_mounted(char *str) { if (kstrtobool(str, &bdev_allow_write_mounted)) pr_warn("Invalid option string for bdev_allow_write_mounted:" " '%s'\n", str); return 1; } __setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted); |
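/*
 * Usage note (illustrative, based on the __setup() handler above): the
 * built-in default can be overridden on the kernel command line, e.g.
 *
 *	bdev_allow_write_mounted=0
 *
 * which, per the checks in bdev_may_open() above, makes opens for write fail
 * while a BLK_OPEN_RESTRICT_WRITES holder (such as a mounted filesystem)
 * holds the device.
 */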
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002	Andrew Morton
 *		Initial version.
 */

/**
 * DOC: Readahead Overview
 *
 * Readahead is used to read content into the page cache before it is
 * explicitly requested by the application. Readahead only ever
 * attempts to read folios that are not yet in the page cache. If a
 * folio is present but not up-to-date, readahead will not try to read
 * it. In that case a simple ->read_folio() will be requested.
 *
 * Readahead is triggered when an application read request (whether a
 * system call or a page fault) finds that the requested folio is not in
 * the page cache, or that it is in the page cache and has the
 * readahead flag set. This flag indicates that the folio was read
 * as part of a previous readahead request and now that it has been
 * accessed, it is time for the next readahead.
 *
 * Each readahead request is partly synchronous read, and partly async
 * readahead. This is reflected in the struct file_ra_state which
 * contains ->size being the total number of pages, and ->async_size
 * which is the number of pages in the async section. The readahead
 * flag will be set on the first folio in this async section to trigger
 * a subsequent readahead. Once a series of sequential reads has been
 * established, there should be no need for a synchronous component and
 * all readahead requests will be fully asynchronous.
 *
 * When either of the triggers causes a readahead, three numbers need
 * to be determined: the start of the region to read, the size of the
 * region, and the size of the async tail.
 *
 * The start of the region is simply the first page address at or after
 * the accessed address, which is not currently populated in the page
 * cache. This is found with a simple search in the page cache.
 *
 * The size of the async tail is determined by subtracting the size that
 * was explicitly requested from the determined request size, unless
 * this would be less than zero - then zero is used. NOTE THIS
 * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED
 * PAGE. ALSO THIS CALCULATION IS NOT USED CONSISTENTLY.
 *
 * The size of the region is normally determined from the size of the
 * previous readahead which loaded the preceding pages. This may be
 * discovered from the struct file_ra_state for simple sequential reads,
 * or from examining the state of the page cache when multiple
 * sequential reads are interleaved. Specifically: where the readahead
 * was triggered by the readahead flag, the size of the previous
 * readahead is assumed to be the number of pages from the triggering
 * page to the start of the new readahead. In these cases, the size of
 * the previous readahead is scaled, often doubled, for the new
 * readahead, though see get_next_ra_size() for details.
 *
 * If the size of the previous read cannot be determined, the number of
 * preceding pages in the page cache is used to estimate the size of
 * a previous read. This estimate could easily be misled by random
 * reads being coincidentally adjacent, so it is ignored unless it is
 * larger than the current request, and it is not scaled up, unless it
 * is at the start of file.
 *
 * In general readahead is accelerated at the start of the file, as
 * reads from there are often sequential.
There are other minor * adjustments to the readahead size in various special cases and these * are best discovered by reading the code. * * The above calculation, based on the previous readahead size, * determines the size of the readahead, to which any requested read * size may be added. * * Readahead requests are sent to the filesystem using the ->readahead() * address space operation, for which mpage_readahead() is a canonical * implementation. ->readahead() should normally initiate reads on all * folios, but may fail to read any or all folios without causing an I/O * error. The page cache reading code will issue a ->read_folio() request * for any folio which ->readahead() did not read, and only an error * from this will be final. * * ->readahead() will generally call readahead_folio() repeatedly to get * each folio from those prepared for readahead. It may fail to read a * folio by: * * * not calling readahead_folio() sufficiently many times, effectively * ignoring some folios, as might be appropriate if the path to * storage is congested. * * * failing to actually submit a read request for a given folio, * possibly due to insufficient resources, or * * * getting an error during subsequent processing of a request. * * In the last two cases, the folio should be unlocked by the filesystem * to indicate that the read attempt has failed. In the first case the * folio will be unlocked by the VFS. * * Those folios not in the final ``async_size`` of the request should be * considered to be important and ->readahead() should not fail them due * to congestion or temporary resource unavailability, but should wait * for necessary resources (e.g. memory or indexing information) to * become available. Folios in the final ``async_size`` may be * considered less urgent and failure to read them is more acceptable. * In this case it is best to use filemap_remove_folio() to remove the * folios from the page cache as is automatically done for folios that * were not fetched with readahead_folio(). This will allow a * subsequent synchronous readahead request to try them again. If they * are left in the page cache, then they will be read individually using * ->read_folio() which may be less efficient. */ #include <linux/blkdev.h> #include <linux/kernel.h> #include <linux/dax.h> #include <linux/gfp.h> #include <linux/export.h> #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> #include <linux/pagemap.h> #include <linux/psi.h> #include <linux/syscalls.h> #include <linux/file.h> #include <linux/mm_inline.h> #include <linux/blk-cgroup.h> #include <linux/fadvise.h> #include <linux/sched/mm.h> #define CREATE_TRACE_POINTS #include <trace/events/readahead.h> #include "internal.h" /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. */ void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; ra->prev_pos = -1; } EXPORT_SYMBOL_GPL(file_ra_state_init); static void read_pages(struct readahead_control *rac) { const struct address_space_operations *aops = rac->mapping->a_ops; struct folio *folio; struct blk_plug plug; if (!readahead_count(rac)) return; if (unlikely(rac->_workingset)) psi_memstall_enter(&rac->_pflags); blk_start_plug(&plug); if (aops->readahead) { aops->readahead(rac); /* Clean up the remaining folios. 
*/ while ((folio = readahead_folio(rac)) != NULL) { folio_get(folio); filemap_remove_folio(folio); folio_unlock(folio); folio_put(folio); } } else { while ((folio = readahead_folio(rac)) != NULL) aops->read_folio(rac->file, folio); } blk_finish_plug(&plug); if (unlikely(rac->_workingset)) psi_memstall_leave(&rac->_pflags); rac->_workingset = false; BUG_ON(readahead_count(rac)); } static struct folio *ractl_alloc_folio(struct readahead_control *ractl, gfp_t gfp_mask, unsigned int order) { struct folio *folio; folio = filemap_alloc_folio(gfp_mask, order); if (folio && ractl->dropbehind) __folio_set_dropbehind(folio); return folio; } /** * page_cache_ra_unbounded - Start unchecked readahead. * @ractl: Readahead control. * @nr_to_read: The number of pages to read. * @lookahead_size: Where to start the next readahead. * * This function is for filesystems to call when they want to start * readahead beyond a file's stated i_size. This is almost certainly * not the function you want to call. Use page_cache_async_readahead() * or page_cache_sync_readahead() instead. * * Context: File is referenced by caller. Mutexes may be held by caller. * May sleep, but will not reenter filesystem to reclaim memory. */ void page_cache_ra_unbounded(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct address_space *mapping = ractl->mapping; unsigned long index = readahead_index(ractl); gfp_t gfp_mask = readahead_gfp_mask(mapping); unsigned long mark = ULONG_MAX, i = 0; unsigned int min_nrpages = mapping_min_folio_nrpages(mapping); /* * Partway through the readahead operation, we will have added * locked pages to the page cache, but will not yet have submitted * them for I/O. Adding another page may need to allocate memory, * which can trigger memory reclaim. Telling the VM we're in * the middle of a filesystem operation will cause it to not * touch file-backed pages, preventing a deadlock. Most (all?) * filesystems already specify __GFP_NOFS in their mapping's * gfp_mask, but let's be explicit here. */ unsigned int nofs = memalloc_nofs_save(); trace_page_cache_ra_unbounded(mapping->host, index, nr_to_read, lookahead_size); filemap_invalidate_lock_shared(mapping); index = mapping_align_index(mapping, index); /* * As iterator `i` is aligned to min_nrpages, round_up the * difference between nr_to_read and lookahead_size to mark the * index that only has lookahead or "async_region" to set the * readahead flag. */ if (lookahead_size <= nr_to_read) { unsigned long ra_folio_index; ra_folio_index = round_up(readahead_index(ractl) + nr_to_read - lookahead_size, min_nrpages); mark = ra_folio_index - index; } nr_to_read += readahead_index(ractl) - index; ractl->_index = index; /* * Preallocate as many pages as we will need. */ while (i < nr_to_read) { struct folio *folio = xa_load(&mapping->i_pages, index + i); int ret; if (folio && !xa_is_value(folio)) { /* * Page already present? Kick off the current batch * of contiguous pages before continuing with the * next batch. This page may be the one we would * have intended to mark as Readahead, but we don't * have a stable reference to this page, and it's * not worth getting one just for that. 
*/ read_pages(ractl); ractl->_index += min_nrpages; i = ractl->_index + ractl->_nr_pages - index; continue; } folio = ractl_alloc_folio(ractl, gfp_mask, mapping_min_folio_order(mapping)); if (!folio) break; ret = filemap_add_folio(mapping, folio, index + i, gfp_mask); if (ret < 0) { folio_put(folio); if (ret == -ENOMEM) break; read_pages(ractl); ractl->_index += min_nrpages; i = ractl->_index + ractl->_nr_pages - index; continue; } if (i == mark) folio_set_readahead(folio); ractl->_workingset |= folio_test_workingset(folio); ractl->_nr_pages += min_nrpages; i += min_nrpages; } /* * Now start the IO. We ignore I/O errors - if the folio is not * uptodate then the caller will launch read_folio again, and * will then handle the error. */ read_pages(ractl); filemap_invalidate_unlock_shared(mapping); memalloc_nofs_restore(nofs); } EXPORT_SYMBOL_GPL(page_cache_ra_unbounded); /* * do_page_cache_ra() actually reads a chunk of disk. It allocates * the pages first, then submits them for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. */ static void do_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct inode *inode = ractl->mapping->host; unsigned long index = readahead_index(ractl); loff_t isize = i_size_read(inode); pgoff_t end_index; /* The last page we want to read */ if (isize == 0) return; end_index = (isize - 1) >> PAGE_SHIFT; if (index > end_index) return; /* Don't read past the page containing the last byte of the file */ if (nr_to_read > end_index - index) nr_to_read = end_index - index + 1; page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size); } /* * Chunk the readahead into 2 megabyte units, so that we don't pin too much * memory at once. */ void force_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read) { struct address_space *mapping = ractl->mapping; struct file_ra_state *ra = ractl->ra; struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages; if (unlikely(!mapping->a_ops->read_folio && !mapping->a_ops->readahead)) return; /* * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); nr_to_read = min_t(unsigned long, nr_to_read, max_pages); while (nr_to_read) { unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE; if (this_chunk > nr_to_read) this_chunk = nr_to_read; do_page_cache_ra(ractl, this_chunk, 0); nr_to_read -= this_chunk; } } /* * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large * for 128k (32 page) max ra * 1-2 page = 16k, 3-4 page 32k, 5-8 page = 64k, > 8 page = 128k initial */ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) { unsigned long newsize = roundup_pow_of_two(size); if (newsize <= max / 32) newsize = newsize * 4; else if (newsize <= max / 4) newsize = newsize * 2; else newsize = max; return newsize; } /* * Get the previous window size, ramp it up, and * return it as the new window size. */ static unsigned long get_next_ra_size(struct file_ra_state *ra, unsigned long max) { unsigned long cur = ra->size; if (cur < max / 16) return 4 * cur; if (cur <= max / 2) return 2 * cur; return max; } /* * On-demand readahead design. 
* * The fields in struct file_ra_state represent the most-recently-executed * readahead attempt: * * |<----- async_size ---------| * |------------------- size -------------------->| * |==================#===========================| * ^start ^page marked with PG_readahead * * To overlap application thinking time and disk I/O time, we do * `readahead pipelining': Do not wait until the application consumed all * readahead pages and stalled on the missing page at readahead_index; * Instead, submit an asynchronous readahead I/O as soon as there are * only async_size pages left in the readahead window. Normally async_size * will be equal to size, for maximum pipelining. * * In interleaved sequential reads, concurrent streams on the same fd can * be invalidating each other's readahead state. So we flag the new readahead * page at (start+size-async_size) with PG_readahead, and use it as readahead * indicator. The flag won't be set on already cached pages, to avoid the * readahead-for-nothing fuss, saving pointless page cache lookups. * * prev_pos tracks the last visited byte in the _previous_ read request. * It should be maintained by the caller, and will be used for detecting * small random reads. Note that the readahead algorithm checks loosely * for sequential patterns. Hence interleaved reads might be served as * sequential ones. * * There is a special-case: if the first page which the application tries to * read happens to be the first page of the file, it is assumed that a linear * read is about to happen and the window is immediately set to the initial size * based on I/O request size and the max_readahead. * * The code ramps up the readahead size aggressively at first, but slow down as * it approaches max_readhead. */ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, pgoff_t mark, unsigned int order, gfp_t gfp) { int err; struct folio *folio = ractl_alloc_folio(ractl, gfp, order); if (!folio) return -ENOMEM; mark = round_down(mark, 1UL << order); if (index == mark) folio_set_readahead(folio); err = filemap_add_folio(ractl->mapping, folio, index, gfp); if (err) { folio_put(folio); return err; } ractl->_nr_pages += 1UL << order; ractl->_workingset |= folio_test_workingset(folio); return 0; } void page_cache_ra_order(struct readahead_control *ractl, struct file_ra_state *ra) { struct address_space *mapping = ractl->mapping; pgoff_t start = readahead_index(ractl); pgoff_t index = start; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; pgoff_t mark = index + ra->size - ra->async_size; unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); unsigned int new_order = ra->order; trace_page_cache_ra_order(mapping->host, start, ra); if (!mapping_large_folio_support(mapping)) { ra->order = 0; goto fallback; } limit = min(limit, index + ra->size - 1); new_order = min(mapping_max_folio_order(mapping), new_order); new_order = min_t(unsigned int, new_order, ilog2(ra->size)); new_order = max(new_order, min_order); ra->order = new_order; /* See comment in page_cache_ra_unbounded() */ nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); /* * If the new_order is greater than min_order and index is * already aligned to new_order, then this will be noop as index * aligned to new_order should also be aligned to min_order. 
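	 *
	 * For example (hypothetical numbers): with min_order == 2 and
	 * new_order == 4, an index that is a multiple of 16 pages is
	 * necessarily also a multiple of 4 pages, so the min_order
	 * alignment below leaves it unchanged.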
*/ ractl->_index = mapping_align_index(mapping, index); index = readahead_index(ractl); while (index <= limit) { unsigned int order = new_order; /* Align with smaller pages if needed */ if (index & ((1UL << order) - 1)) order = __ffs(index); /* Don't allocate pages past EOF */ while (order > min_order && index + (1UL << order) - 1 > limit) order--; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) break; index += 1UL << order; } read_pages(ractl); filemap_invalidate_unlock_shared(mapping); memalloc_nofs_restore(nofs); /* * If there were already pages in the page cache, then we may have * left some gaps. Let the regular readahead code take care of this * situation below. */ if (!err) return; fallback: /* * ->readahead() may have updated readahead window size so we have to * check there's still something to read. */ if (ra->size > index - start) do_page_cache_ra(ractl, ra->size - (index - start), ra->async_size); } static unsigned long ractl_max_pages(struct readahead_control *ractl, unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); unsigned long max_pages = ractl->ra->ra_pages; /* * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ if (req_size > max_pages && bdi->io_pages > max_pages) max_pages = min(req_size, bdi->io_pages); return max_pages; } void page_cache_sync_ra(struct readahead_control *ractl, unsigned long req_count) { pgoff_t index = readahead_index(ractl); bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM); struct file_ra_state *ra = ractl->ra; unsigned long max_pages, contig_count; pgoff_t prev_index, miss; trace_page_cache_sync_ra(ractl->mapping->host, index, ra, req_count); /* * Even if readahead is disabled, issue this request as readahead * as we'll need it to satisfy the requested range. The forced * readahead will do the right thing and limit the read to just the * requested range, which we'll set to 1 page for this case. */ if (!ra->ra_pages || blk_cgroup_congested()) { if (!ractl->file) return; req_count = 1; do_forced_ra = true; } /* be dumb */ if (do_forced_ra) { force_page_cache_ra(ractl, req_count); return; } max_pages = ractl_max_pages(ractl, req_count); prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; /* * A start of file, oversized read, or sequential cache miss: * trivial case: (index - prev_index) == 1 * unaligned reads: (index - prev_index) == 0 */ if (!index || req_count > max_pages || index - prev_index <= 1UL) { ra->start = index; ra->size = get_init_ra_size(req_count, max_pages); ra->async_size = ra->size > req_count ? ra->size - req_count : ra->size >> 1; goto readit; } /* * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ rcu_read_lock(); miss = page_cache_prev_miss(ractl->mapping, index - 1, max_pages); rcu_read_unlock(); contig_count = index - miss - 1; /* * Standalone, small random read. Read as is, and do not pollute the * readahead state. 
*/ if (contig_count <= req_count) { do_page_cache_ra(ractl, req_count, 0); return; } /* * File cached from the beginning: * it is a strong indication of long-run stream (or whole-file-read) */ if (miss == ULONG_MAX) contig_count *= 2; ra->start = index; ra->size = min(contig_count + req_count, max_pages); ra->async_size = 1; readit: ra->order = 0; ractl->_index = ra->start; page_cache_ra_order(ractl, ra); } EXPORT_SYMBOL_GPL(page_cache_sync_ra); void page_cache_async_ra(struct readahead_control *ractl, struct folio *folio, unsigned long req_count) { unsigned long max_pages; struct file_ra_state *ra = ractl->ra; pgoff_t index = readahead_index(ractl); pgoff_t expected, start, end, aligned_end, align; /* no readahead */ if (!ra->ra_pages) return; /* * Same bit is used for PG_readahead and PG_reclaim. */ if (folio_test_writeback(folio)) return; trace_page_cache_async_ra(ractl->mapping->host, index, ra, req_count); folio_clear_readahead(folio); if (blk_cgroup_congested()) return; max_pages = ractl_max_pages(ractl, req_count); /* * It's the expected callback index, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ expected = round_down(ra->start + ra->size - ra->async_size, folio_nr_pages(folio)); if (index == expected) { ra->start += ra->size; /* * In the case of MADV_HUGEPAGE, the actual size might exceed * the readahead window. */ ra->size = max(ra->size, get_next_ra_size(ra, max_pages)); goto readit; } /* * Hit a marked folio without valid readahead state. * E.g. interleaved reads. * Query the pagecache for async_size, which normally equals to * readahead size. Ramp it up and use it as the new readahead size. */ rcu_read_lock(); start = page_cache_next_miss(ractl->mapping, index + 1, max_pages); rcu_read_unlock(); if (!start || start - index > max_pages) return; ra->start = start; ra->size = start - index; /* old async_size */ ra->size += req_count; ra->size = get_next_ra_size(ra, max_pages); readit: ra->order += 2; align = 1UL << min(ra->order, ffs(max_pages) - 1); end = ra->start + ra->size; aligned_end = round_down(end, align); if (aligned_end > ra->start) ra->size -= end - aligned_end; ra->async_size = ra->size; ractl->_index = ra->start; page_cache_ra_order(ractl, ra); } EXPORT_SYMBOL_GPL(page_cache_async_ra); ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { struct file *file; const struct inode *inode; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; file = fd_file(f); if (!(file->f_mode & FMODE_READ)) return -EBADF; /* * The readahead() syscall is intended to run only on files * that can execute readahead. If readahead is not possible * on this file, then we must return -EINVAL. 
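	 *
	 * (Illustrative userspace example: readahead(fd, 0, 1 << 20) asks for
	 * the first megabyte of a regular file to be read into the page
	 * cache; the same call on a pipe or socket fails with -EINVAL because
	 * of the checks below.)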
*/ if (!file->f_mapping) return -EINVAL; if (!file->f_mapping->a_ops) return -EINVAL; inode = file_inode(file); if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -EINVAL; if (IS_ANON_FILE(inode)) return -EINVAL; return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED); } SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) { return ksys_readahead(fd, offset, count); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_READAHEAD) COMPAT_SYSCALL_DEFINE4(readahead, int, fd, compat_arg_u64_dual(offset), size_t, count) { return ksys_readahead(fd, compat_arg_u64_glue(offset), count); } #endif /** * readahead_expand - Expand a readahead request * @ractl: The request to be expanded * @new_start: The revised start * @new_len: The revised size of the request * * Attempt to expand a readahead request outwards from the current size to the * specified size by inserting locked pages before and after the current window * to increase the size to the new window. This may involve the insertion of * THPs, in which case the window may get expanded even beyond what was * requested. * * The algorithm will stop if it encounters a conflicting page already in the * pagecache and leave a smaller expansion than requested. * * The caller must check for this by examining the revised @ractl object for a * different expansion than was requested. */ void readahead_expand(struct readahead_control *ractl, loff_t new_start, size_t new_len) { struct address_space *mapping = ractl->mapping; struct file_ra_state *ra = ractl->ra; pgoff_t new_index, new_nr_pages; gfp_t gfp_mask = readahead_gfp_mask(mapping); unsigned long min_nrpages = mapping_min_folio_nrpages(mapping); unsigned int min_order = mapping_min_folio_order(mapping); new_index = new_start / PAGE_SIZE; /* * Readahead code should have aligned the ractl->_index to * min_nrpages before calling readahead aops. */ VM_BUG_ON(!IS_ALIGNED(ractl->_index, min_nrpages)); /* Expand the leading edge downwards */ while (ractl->_index > new_index) { unsigned long index = ractl->_index - 1; struct folio *folio = xa_load(&mapping->i_pages, index); if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; index = mapping_align_index(mapping, index); if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; } if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages += min_nrpages; ractl->_index = folio->index; } new_len += new_start - readahead_pos(ractl); new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE); /* Expand the trailing edge upwards */ while (ractl->_nr_pages < new_nr_pages) { unsigned long index = ractl->_index + ractl->_nr_pages; struct folio *folio = xa_load(&mapping->i_pages, index); if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; index = mapping_align_index(mapping, index); if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; } if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages += min_nrpages; if (ra) { ra->size += min_nrpages; ra->async_size += min_nrpages; } } } EXPORT_SYMBOL(readahead_expand); |
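/*
 * Example (hypothetical ->readahead() implementation; the 64 KiB on-disk
 * block size is only an assumption for illustration): a filesystem that
 * wants to read whole blocks could widen the window it was given roughly
 * like this before pulling folios out of @ractl:
 *
 *	loff_t start = round_down(readahead_pos(ractl), SZ_64K);
 *	size_t len = round_up(readahead_pos(ractl) + readahead_length(ractl),
 *			      SZ_64K) - start;
 *
 *	readahead_expand(ractl, start, len);
 */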
// SPDX-License-Identifier: GPL-2.0-only /* * umh - the kernel usermode helper */ #include <linux/module.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/binfmts.h> #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/completion.h> #include <linux/cred.h> #include <linux/file.h> #include <linux/fs_struct.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/resource.h> #include <linux/notifier.h> #include <linux/suspend.h> #include <linux/rwsem.h> #include <linux/ptrace.h> #include <linux/async.h> #include <linux/uaccess.h> #include <linux/initrd.h> #include <linux/freezer.h> #include <trace/events/module.h> static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); static void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) (*info->cleanup)(info); kfree(info); } static void umh_complete(struct subprocess_info *sub_info) { struct completion *comp = xchg(&sub_info->complete, NULL); /* * See
call_usermodehelper_exec(). If xchg() returns NULL * we own sub_info, the UMH_KILLABLE caller has gone away * or the caller used UMH_NO_WAIT. */ if (comp) complete(comp); else call_usermodehelper_freeinfo(sub_info); } /* * This is the task which runs the usermode application */ static int call_usermodehelper_exec_async(void *data) { struct subprocess_info *sub_info = data; struct cred *new; int retval; spin_lock_irq(&current->sighand->siglock); flush_signal_handlers(current, 1); spin_unlock_irq(&current->sighand->siglock); /* * Initial kernel threads share their FS with init, in order to * get the init root directory. But we've now created a new * thread that is going to execve a user process and has its own * 'struct fs_struct'. Reset umask to the default. */ current->fs->umask = 0022; /* * Our parent (unbound workqueue) runs with elevated scheduling * priority. Avoid propagating that into the userspace child. */ set_user_nice(current, 0); retval = -ENOMEM; new = prepare_kernel_cred(current); if (!new) goto out; spin_lock(&umh_sysctl_lock); new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); new->cap_inheritable = cap_intersect(usermodehelper_inheritable, new->cap_inheritable); spin_unlock(&umh_sysctl_lock); if (sub_info->init) { retval = sub_info->init(sub_info, new); if (retval) { abort_creds(new); goto out; } } commit_creds(new); wait_for_initramfs(); retval = kernel_execve(sub_info->path, (const char *const *)sub_info->argv, (const char *const *)sub_info->envp); out: sub_info->retval = retval; /* * call_usermodehelper_exec_sync() will call umh_complete * if UMH_WAIT_PROC. */ if (!(sub_info->wait & UMH_WAIT_PROC)) umh_complete(sub_info); if (!retval) return 0; do_exit(0); } /* Handles UMH_WAIT_PROC. */ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) { pid_t pid; /* If SIGCLD is ignored do_wait won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); if (pid < 0) sub_info->retval = pid; else kernel_wait(pid, &sub_info->retval); /* Restore default kernel sig handler */ kernel_sigaction(SIGCHLD, SIG_IGN); umh_complete(sub_info); } /* * We need to create the usermodehelper kernel thread from a task that is affine * to an optimized set of CPUs (or nohz housekeeping ones) such that they * inherit the widest affinity irrespective of call_usermodehelper() callers with * possibly reduced affinity (eg: per-cpu workqueues). We don't want * usermodehelper targets to contend for a busy CPU. * * Unbound workqueues provide such wide affinity and allow blocking on * UMH_WAIT_PROC requests without blocking pending requests (up to some limit). * * Besides, workqueues provide the privilege level that the caller might not have * to perform the usermodehelper request. * */ static void call_usermodehelper_exec_work(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); if (sub_info->wait & UMH_WAIT_PROC) { call_usermodehelper_exec_sync(sub_info); } else { pid_t pid; /* * Use CLONE_PARENT to reparent it to kthreadd; we do not * want to pollute current->children, and we need a parent * that always ignores SIGCHLD to ensure auto-reaping.
*/ pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, CLONE_PARENT | SIGCHLD); if (pid < 0) { sub_info->retval = pid; umh_complete(sub_info); } } } /* * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY * (used for preventing user land processes from being created after the user * land has been frozen during a system-wide hibernation or suspend operation). * Should always be manipulated under umhelper_sem acquired for write. */ static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; /* Number of helpers running */ static atomic_t running_helpers = ATOMIC_INIT(0); /* * Wait queue head used by usermodehelper_disable() to wait for all running * helpers to finish. */ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); /* * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled * to become 'false'. */ static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq); /* * Time to wait for running_helpers to become zero before the setting of * usermodehelper_disabled in usermodehelper_disable() fails */ #define RUNNING_HELPERS_TIMEOUT (5 * HZ) int usermodehelper_read_trylock(void) { DEFINE_WAIT(wait); int ret = 0; down_read(&umhelper_sem); for (;;) { prepare_to_wait(&usermodehelper_disabled_waitq, &wait, TASK_INTERRUPTIBLE); if (!usermodehelper_disabled) break; if (usermodehelper_disabled == UMH_DISABLED) ret = -EAGAIN; up_read(&umhelper_sem); if (ret) break; schedule(); try_to_freeze(); down_read(&umhelper_sem); } finish_wait(&usermodehelper_disabled_waitq, &wait); return ret; } EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); long usermodehelper_read_lock_wait(long timeout) { DEFINE_WAIT(wait); if (timeout < 0) return -EINVAL; down_read(&umhelper_sem); for (;;) { prepare_to_wait(&usermodehelper_disabled_waitq, &wait, TASK_UNINTERRUPTIBLE); if (!usermodehelper_disabled) break; up_read(&umhelper_sem); timeout = schedule_timeout(timeout); if (!timeout) break; down_read(&umhelper_sem); } finish_wait(&usermodehelper_disabled_waitq, &wait); return timeout; } EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait); void usermodehelper_read_unlock(void) { up_read(&umhelper_sem); } EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); /** * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. * @depth: New value to assign to usermodehelper_disabled. * * Change the value of usermodehelper_disabled (under umhelper_sem locked for * writing) and wakeup tasks waiting for it to change. */ void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) { down_write(&umhelper_sem); usermodehelper_disabled = depth; wake_up(&usermodehelper_disabled_waitq); up_write(&umhelper_sem); } /** * __usermodehelper_disable - Prevent new helpers from being started. * @depth: New value to assign to usermodehelper_disabled. * * Set usermodehelper_disabled to @depth and wait for running helpers to exit. */ int __usermodehelper_disable(enum umh_disable_depth depth) { long retval; if (!depth) return -EINVAL; down_write(&umhelper_sem); usermodehelper_disabled = depth; up_write(&umhelper_sem); /* * From now on call_usermodehelper_exec() won't start any new * helpers, so it is sufficient if running_helpers turns out to * be zero at one point (it may be increased later, but that * doesn't matter). 
*/ retval = wait_event_timeout(running_helpers_waitq, atomic_read(&running_helpers) == 0, RUNNING_HELPERS_TIMEOUT); if (retval) return 0; __usermodehelper_set_disable_depth(UMH_ENABLED); return -EAGAIN; } static void helper_lock(void) { atomic_inc(&running_helpers); smp_mb__after_atomic(); } static void helper_unlock(void) { if (atomic_dec_and_test(&running_helpers)) wake_up(&running_helpers_waitq); } /** * call_usermodehelper_setup - prepare to call a usermode helper * @path: path to usermode executable * @argv: arg vector for process * @envp: environment for process * @gfp_mask: gfp mask for memory allocation * @init: an init function * @cleanup: a cleanup function * @data: arbitrary context sensitive data * * Returns either %NULL on allocation failure, or a subprocess_info * structure. This should be passed to call_usermodehelper_exec to * exec the process and free the structure. * * The init function is used to customize the helper process prior to * exec. A non-zero return code causes the process to error out, exit, * and return the failure to the calling process * * The cleanup function is just before the subprocess_info is about to * be freed. This can be used for freeing the argv and envp. The * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. */ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, char **envp, gfp_t gfp_mask, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), void *data) { struct subprocess_info *sub_info; sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); if (!sub_info) goto out; INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); #ifdef CONFIG_STATIC_USERMODEHELPER sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH; #else sub_info->path = path; #endif sub_info->argv = argv; sub_info->envp = envp; sub_info->cleanup = cleanup; sub_info->init = init; sub_info->data = data; out: return sub_info; } EXPORT_SYMBOL(call_usermodehelper_setup); /** * call_usermodehelper_exec - start a usermode application * @sub_info: information about the subprocess * @wait: wait for the application to finish and return status. * when UMH_NO_WAIT don't wait at all, but you get no useful error back * when the program couldn't be exec'ed. This makes it safe to call * from interrupt context. * * Runs a user-space application. The application is started * asynchronously if wait is not set, and runs as a child of system workqueues. * (ie. it runs with full root capabilities and optimized affinity). * * Note: successful return value does not guarantee the helper was called at * all. You can't rely on sub_info->{init,cleanup} being called even for * UMH_WAIT_* wait modes as STATIC_USERMODEHELPER_PATH="" turns all helpers * into a successful no-op. */ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { unsigned int state = TASK_UNINTERRUPTIBLE; DECLARE_COMPLETION_ONSTACK(done); int retval = 0; if (!sub_info->path) { call_usermodehelper_freeinfo(sub_info); return -EINVAL; } helper_lock(); if (usermodehelper_disabled) { retval = -EBUSY; goto out; } /* * If there is no binary for us to call, then just return and get out of * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and * disable all call_usermodehelper() calls. */ if (strlen(sub_info->path) == 0) goto out; /* * Set the completion pointer only if there is a waiter. 
* This makes it possible to use umh_complete to free * the data structure in case of UMH_NO_WAIT. */ sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; sub_info->wait = wait; queue_work(system_unbound_wq, &sub_info->work); if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; if (wait & UMH_FREEZABLE) state |= TASK_FREEZABLE; if (wait & UMH_KILLABLE) { retval = wait_for_completion_state(&done, state | TASK_KILLABLE); if (!retval) goto wait_done; /* umh_complete() will see NULL and free sub_info */ if (xchg(&sub_info->complete, NULL)) goto unlock; /* * fallthrough; in case of -ERESTARTSYS now do uninterruptible * wait_for_completion_state(). Since umh_complete() shall call * complete() in a moment if xchg() above returned NULL, this * uninterruptible wait_for_completion_state() will not block * SIGKILL'ed processes for long. */ } wait_for_completion_state(&done, state); wait_done: retval = sub_info->retval; out: call_usermodehelper_freeinfo(sub_info); unlock: helper_unlock(); return retval; } EXPORT_SYMBOL(call_usermodehelper_exec); /** * call_usermodehelper() - prepare and start a usermode application * @path: path to usermode executable * @argv: arg vector for process * @envp: environment for process * @wait: wait for the application to finish and return status. * when UMH_NO_WAIT don't wait at all, but you get no useful error back * when the program couldn't be exec'ed. This makes it safe to call * from interrupt context. * * This function is the equivalent to use call_usermodehelper_setup() and * call_usermodehelper_exec(). */ int call_usermodehelper(const char *path, char **argv, char **envp, int wait) { struct subprocess_info *info; gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; info = call_usermodehelper_setup(path, argv, envp, gfp_mask, NULL, NULL, NULL); if (info == NULL) return -ENOMEM; return call_usermodehelper_exec(info, wait); } EXPORT_SYMBOL(call_usermodehelper); #if defined(CONFIG_SYSCTL) static int proc_cap_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long cap_array[2]; kernel_cap_t new_cap, *cap; int err; if (write && (!capable(CAP_SETPCAP) || !capable(CAP_SYS_MODULE))) return -EPERM; /* * convert from the global kernel_cap_t to the ulong array to print to * userspace if this is a read. * * Legacy format: capabilities are exposed as two 32-bit values */ cap = table->data; spin_lock(&umh_sysctl_lock); cap_array[0] = (u32) cap->val; cap_array[1] = cap->val >> 32; spin_unlock(&umh_sysctl_lock); t = *table; t.data = &cap_array; /* * actually read or write and array of ulongs from userspace. 
Remember * these are least significant 32 bits first */ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; new_cap.val = (u32)cap_array[0]; new_cap.val += (u64)cap_array[1] << 32; /* * Drop everything not in the new_cap (but don't add things) */ if (write) { spin_lock(&umh_sysctl_lock); *cap = cap_intersect(*cap, new_cap); spin_unlock(&umh_sysctl_lock); } return 0; } static const struct ctl_table usermodehelper_table[] = { { .procname = "bset", .data = &usermodehelper_bset, .maxlen = 2 * sizeof(unsigned long), .mode = 0600, .proc_handler = proc_cap_handler, }, { .procname = "inheritable", .data = &usermodehelper_inheritable, .maxlen = 2 * sizeof(unsigned long), .mode = 0600, .proc_handler = proc_cap_handler, }, }; static int __init init_umh_sysctls(void) { register_sysctl_init("kernel/usermodehelper", usermodehelper_table); return 0; } early_initcall(init_umh_sysctls); #endif /* CONFIG_SYSCTL */ |
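Illustrative sketch (not part of kernel/umh.c): a minimal in-kernel caller of the usermode helper API defined above. The helper path and its arguments are hypothetical; UMH_WAIT_PROC makes the call block until the program exits and return its wait-style exit status (or a negative errno).

#include <linux/kmod.h>

static int example_run_helper(void)
{
	/* Hypothetical helper binary and arguments. */
	char *argv[] = { "/sbin/example-notify", "--event", "overheat", NULL };
	static char *envp[] = {
		"HOME=/",
		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
		NULL
	};

	return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
}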
/* * linux/include/linux/console.h * * Copyright (C) 1993 Hamish Macdonald * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details.
* * Changed: * 10-Mar-94: Arno Griffioen: Conversion for vt100 emulator port from PC LINUX */ #ifndef _LINUX_CONSOLE_H_ #define _LINUX_CONSOLE_H_ 1 #include <linux/atomic.h> #include <linux/bits.h> #include <linux/irq_work.h> #include <linux/rculist.h> #include <linux/rcuwait.h> #include <linux/types.h> #include <linux/vesa.h> struct vc_data; struct console_font_op; struct console_font; struct module; struct tty_struct; struct notifier_block; enum con_scroll { SM_UP, SM_DOWN, }; enum vc_intensity; /** * struct consw - callbacks for consoles * * @owner: the module to get references of when this console is used * @con_startup: set up the console and return its name (like VGA, EGA, ...) * @con_init: initialize the console on @vc. @init is true for the very first * call on this @vc. * @con_deinit: deinitialize the console from @vc. * @con_clear: erase @count characters at [@x, @y] on @vc. @count >= 1. * @con_putc: emit one character with attributes @ca to [@x, @y] on @vc. * (optional -- @con_putcs would be called instead) * @con_putcs: emit @count characters with attributes @s to [@x, @y] on @vc. * @con_cursor: enable/disable cursor depending on @enable * @con_scroll: move lines from @top to @bottom in direction @dir by @lines. * Return true if no generic handling should be done. * Invoked by csi_M and printing to the console. * @con_switch: notifier about the console switch; it is supposed to return * true if a redraw is needed. * @con_blank: blank/unblank the console. The target mode is passed in @blank. * @mode_switch is set if changing from/to text/graphics. The hook * is supposed to return true if a redraw is needed. * @con_font_set: set console @vc font to @font with height @vpitch. @flags can * be %KD_FONT_FLAG_DONT_RECALC. (optional) * @con_font_get: fetch the current font on @vc of height @vpitch into @font. * (optional) * @con_font_default: set default font on @vc. @name can be %NULL or font name * to search for. @font can be filled back. (optional) * @con_resize: resize the @vc console to @width x @height. @from_user is true * when this change comes from the user space. * @con_set_palette: sets the palette of the console @vc to @table (optional) * @con_scrolldelta: the contents of the console should be scrolled by @lines. * Invoked by user. (optional) * @con_set_origin: set origin (see &vc_data::vc_origin) of the @vc. If not * provided or returns false, the origin is set to * @vc->vc_screenbuf. (optional) * @con_save_screen: save screen content into @vc->vc_screenbuf. Called e.g. * upon entering graphics. (optional) * @con_build_attr: build attributes based on @color, @intensity and other * parameters. The result is used for both normal and erase * characters. (optional) * @con_invert_region: invert a region of length @count on @vc starting at @p. * (optional) * @con_debug_enter: prepare the console for the debugger. This includes, but * is not limited to, unblanking the console, loading an * appropriate palette, and allowing debugger generated output. * (optional) * @con_debug_leave: restore the console to its pre-debug state as closely as * possible. 
(optional) */ struct consw { struct module *owner; const char *(*con_startup)(void); void (*con_init)(struct vc_data *vc, bool init); void (*con_deinit)(struct vc_data *vc); void (*con_clear)(struct vc_data *vc, unsigned int y, unsigned int x, unsigned int count); void (*con_putc)(struct vc_data *vc, u16 ca, unsigned int y, unsigned int x); void (*con_putcs)(struct vc_data *vc, const u16 *s, unsigned int count, unsigned int ypos, unsigned int xpos); void (*con_cursor)(struct vc_data *vc, bool enable); bool (*con_scroll)(struct vc_data *vc, unsigned int top, unsigned int bottom, enum con_scroll dir, unsigned int lines); bool (*con_switch)(struct vc_data *vc); bool (*con_blank)(struct vc_data *vc, enum vesa_blank_mode blank, bool mode_switch); int (*con_font_set)(struct vc_data *vc, const struct console_font *font, unsigned int vpitch, unsigned int flags); int (*con_font_get)(struct vc_data *vc, struct console_font *font, unsigned int vpitch); int (*con_font_default)(struct vc_data *vc, struct console_font *font, const char *name); int (*con_resize)(struct vc_data *vc, unsigned int width, unsigned int height, bool from_user); void (*con_set_palette)(struct vc_data *vc, const unsigned char *table); void (*con_scrolldelta)(struct vc_data *vc, int lines); bool (*con_set_origin)(struct vc_data *vc); void (*con_save_screen)(struct vc_data *vc); u8 (*con_build_attr)(struct vc_data *vc, u8 color, enum vc_intensity intensity, bool blink, bool underline, bool reverse, bool italic); void (*con_invert_region)(struct vc_data *vc, u16 *p, int count); void (*con_debug_enter)(struct vc_data *vc); void (*con_debug_leave)(struct vc_data *vc); }; extern const struct consw *conswitchp; extern const struct consw dummy_con; /* dummy console buffer */ extern const struct consw vga_con; /* VGA text console */ extern const struct consw newport_con; /* SGI Newport console */ struct screen_info; #ifdef CONFIG_VGA_CONSOLE void vgacon_register_screen(struct screen_info *si); #else static inline void vgacon_register_screen(struct screen_info *si) { } #endif int con_is_bound(const struct consw *csw); int do_unregister_con_driver(const struct consw *csw); int do_take_over_console(const struct consw *sw, int first, int last, int deflt); void give_up_console(const struct consw *sw); #ifdef CONFIG_VT void con_debug_enter(struct vc_data *vc); void con_debug_leave(void); #else static inline void con_debug_enter(struct vc_data *vc) { } static inline void con_debug_leave(void) { } #endif /* * The interface for a console, or any other device that wants to capture * console messages (printer driver?) */ /** * enum cons_flags - General console flags * @CON_PRINTBUFFER: Used by newly registered consoles to avoid duplicate * output of messages that were already shown by boot * consoles or read by userspace via syslog() syscall. * @CON_CONSDEV: Indicates that the console driver is backing * /dev/console. * @CON_ENABLED: Indicates if a console is allowed to print records. If * false, the console also will not advance to later * records. * @CON_BOOT: Marks the console driver as early console driver which * is used during boot before the real driver becomes * available. It will be automatically unregistered * when the real console driver is registered unless * "keep_bootcon" parameter is used. * @CON_ANYTIME: A misnomed historical flag which tells the core code * that the legacy @console::write callback can be invoked * on a CPU which is marked OFFLINE. 
That is misleading as * it suggests that there is no contextual limit for * invoking the callback. The original motivation was * readiness of the per-CPU areas. * @CON_BRL: Indicates a braille device which is exempt from * receiving the printk spam for obvious reasons. * @CON_EXTENDED: The console supports the extended output format of * /dev/kmesg which requires a larger output buffer. * @CON_SUSPENDED: Indicates if a console is suspended. If true, the * printing callbacks must not be called. * @CON_NBCON: Console can operate outside of the legacy style console_lock * constraints. */ enum cons_flags { CON_PRINTBUFFER = BIT(0), CON_CONSDEV = BIT(1), CON_ENABLED = BIT(2), CON_BOOT = BIT(3), CON_ANYTIME = BIT(4), CON_BRL = BIT(5), CON_EXTENDED = BIT(6), CON_SUSPENDED = BIT(7), CON_NBCON = BIT(8), }; /** * struct nbcon_state - console state for nbcon consoles * @atom: Compound of the state fields for atomic operations * * @req_prio: The priority of a handover request * @prio: The priority of the current owner * @unsafe: Console is busy in a non takeover region * @unsafe_takeover: A hostile takeover in an unsafe state happened in the * past. The console cannot be safe until re-initialized. * @cpu: The CPU on which the owner runs * * To be used for reading and preparing of the value stored in the nbcon * state variable @console::nbcon_state. * * The @prio and @req_prio fields are particularly important to allow * spin-waiting to timeout and give up without the risk of a waiter being * assigned the lock after giving up. */ struct nbcon_state { union { unsigned int atom; struct { unsigned int prio : 2; unsigned int req_prio : 2; unsigned int unsafe : 1; unsigned int unsafe_takeover : 1; unsigned int cpu : 24; }; }; }; /* * The nbcon_state struct is used to easily create and interpret values that * are stored in the @console::nbcon_state variable. Ensure this struct stays * within the size boundaries of the atomic variable's underlying type in * order to avoid any accidental truncation. */ static_assert(sizeof(struct nbcon_state) <= sizeof(int)); /** * enum nbcon_prio - console owner priority for nbcon consoles * @NBCON_PRIO_NONE: Unused * @NBCON_PRIO_NORMAL: Normal (non-emergency) usage * @NBCON_PRIO_EMERGENCY: Emergency output (WARN/OOPS...) * @NBCON_PRIO_PANIC: Panic output * @NBCON_PRIO_MAX: The number of priority levels * * A higher priority context can takeover the console when it is * in the safe state. The final attempt to flush consoles in panic() * can be allowed to do so even in an unsafe state (Hope and pray). */ enum nbcon_prio { NBCON_PRIO_NONE = 0, NBCON_PRIO_NORMAL, NBCON_PRIO_EMERGENCY, NBCON_PRIO_PANIC, NBCON_PRIO_MAX, }; struct console; struct printk_buffers; /** * struct nbcon_context - Context for console acquire/release * @console: The associated console * @spinwait_max_us: Limit for spin-wait acquire * @prio: Priority of the context * @allow_unsafe_takeover: Allow performing takeover even if unsafe. Can * be used only with NBCON_PRIO_PANIC @prio. It * might cause a system freeze when the console * is used later. 
* @backlog: Ringbuffer has pending records * @pbufs: Pointer to the text buffer for this context * @seq: The sequence number to print for this context */ struct nbcon_context { /* members set by caller */ struct console *console; unsigned int spinwait_max_us; enum nbcon_prio prio; unsigned int allow_unsafe_takeover : 1; /* members set by emit */ unsigned int backlog : 1; /* members set by acquire */ struct printk_buffers *pbufs; u64 seq; }; /** * struct nbcon_write_context - Context handed to the nbcon write callbacks * @ctxt: The core console context * @outbuf: Pointer to the text buffer for output * @len: Length to write * @unsafe_takeover: If a hostile takeover in an unsafe state has occurred */ struct nbcon_write_context { struct nbcon_context __private ctxt; char *outbuf; unsigned int len; bool unsafe_takeover; }; /** * struct console - The console descriptor structure * @name: The name of the console driver * @write: Legacy write callback to output messages (Optional) * @read: Read callback for console input (Optional) * @device: The underlying TTY device driver (Optional) * @unblank: Callback to unblank the console (Optional) * @setup: Callback for initializing the console (Optional) * @exit: Callback for teardown of the console (Optional) * @match: Callback for matching a console (Optional) * @flags: Console flags. See enum cons_flags * @index: Console index, e.g. port number * @cflag: TTY control mode flags * @ispeed: TTY input speed * @ospeed: TTY output speed * @seq: Sequence number of the next ringbuffer record to print * @dropped: Number of unreported dropped ringbuffer records * @data: Driver private data * @node: hlist node for the console list * * @nbcon_state: State for nbcon consoles * @nbcon_seq: Sequence number of the next record for nbcon to print * @nbcon_device_ctxt: Context available for non-printing operations * @nbcon_prev_seq: Seq num the previous nbcon owner was assigned to print * @pbufs: Pointer to nbcon private buffer * @kthread: Printer kthread for this console * @rcuwait: RCU-safe wait object for @kthread waking * @irq_work: Defer @kthread waking to IRQ work context */ struct console { char name[16]; void (*write)(struct console *co, const char *s, unsigned int count); int (*read)(struct console *co, char *s, unsigned int count); struct tty_driver *(*device)(struct console *co, int *index); void (*unblank)(void); int (*setup)(struct console *co, char *options); int (*exit)(struct console *co); int (*match)(struct console *co, char *name, int idx, char *options); short flags; short index; int cflag; uint ispeed; uint ospeed; u64 seq; unsigned long dropped; void *data; struct hlist_node node; /* nbcon console specific members */ /** * @write_atomic: * * NBCON callback to write out text in any context. (Optional) * * This callback is called with the console already acquired. However, * a higher priority context is allowed to take it over by default. * * The callback must call nbcon_enter_unsafe() and nbcon_exit_unsafe() * around any code where the takeover is not safe, for example, when * manipulating the serial port registers. * * nbcon_enter_unsafe() will fail if the context has lost the console * ownership in the meantime. In this case, the callback is no longer * allowed to go forward. It must back out immediately and carefully. * The buffer content is also no longer trusted since it no longer * belongs to the context. * * The callback should allow the takeover whenever it is safe. It * increases the chance to see messages when the system is in trouble. 
* If the driver must reacquire ownership in order to finalize or * revert hardware changes, nbcon_reacquire_nobuf() can be used. * However, on reacquire the buffer content is no longer available. A * reacquire cannot be used to resume printing. * * The callback can be called from any context (including NMI). * Therefore it must avoid usage of any locking and instead rely * on the console ownership for synchronization. */ void (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt); /** * @write_thread: * * NBCON callback to write out text in task context. * * This callback must be called only in task context with both * device_lock() and the nbcon console acquired with * NBCON_PRIO_NORMAL. * * The same rules for console ownership verification and unsafe * sections handling applies as with write_atomic(). * * The console ownership handling is necessary for synchronization * against write_atomic() which is synchronized only via the context. * * The device_lock() provides the primary serialization for operations * on the device. It might be as relaxed (mutex)[*] or as tight * (disabled preemption and interrupts) as needed. It allows * the kthread to operate in the least restrictive mode[**]. * * [*] Standalone nbcon_context_try_acquire() is not safe with * the preemption enabled, see nbcon_owner_matches(). But it * can be safe when always called in the preemptive context * under the device_lock(). * * [**] The device_lock() makes sure that nbcon_context_try_acquire() * would never need to spin which is important especially with * PREEMPT_RT. */ void (*write_thread)(struct console *con, struct nbcon_write_context *wctxt); /** * @device_lock: * * NBCON callback to begin synchronization with driver code. * * Console drivers typically must deal with access to the hardware * via user input/output (such as an interactive login shell) and * output of kernel messages via printk() calls. This callback is * called by the printk-subsystem whenever it needs to synchronize * with hardware access by the driver. It should be implemented to * use whatever synchronization mechanism the driver is using for * itself (for example, the port lock for uart serial consoles). * * The callback is always called from task context. It may use any * synchronization method required by the driver. * * IMPORTANT: The callback MUST disable migration. The console driver * may be using a synchronization mechanism that already takes * care of this (such as spinlocks). Otherwise this function must * explicitly call migrate_disable(). * * The flags argument is provided as a convenience to the driver. It * will be passed again to device_unlock(). It can be ignored if the * driver does not need it. */ void (*device_lock)(struct console *con, unsigned long *flags); /** * @device_unlock: * * NBCON callback to finish synchronization with driver code. * * It is the counterpart to device_lock(). * * This callback is always called from task context. It must * appropriately re-enable migration (depending on how device_lock() * disabled migration). * * The flags argument is the value of the same variable that was * passed to device_lock(). 
*/ void (*device_unlock)(struct console *con, unsigned long flags); atomic_t __private nbcon_state; atomic_long_t __private nbcon_seq; struct nbcon_context __private nbcon_device_ctxt; atomic_long_t __private nbcon_prev_seq; struct printk_buffers *pbufs; struct task_struct *kthread; struct rcuwait rcuwait; struct irq_work irq_work; }; #ifdef CONFIG_LOCKDEP extern void lockdep_assert_console_list_lock_held(void); #else static inline void lockdep_assert_console_list_lock_held(void) { } #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC extern bool console_srcu_read_lock_is_held(void); #else static inline bool console_srcu_read_lock_is_held(void) { return 1; } #endif extern int console_srcu_read_lock(void); extern void console_srcu_read_unlock(int cookie); extern void console_list_lock(void) __acquires(console_mutex); extern void console_list_unlock(void) __releases(console_mutex); extern struct hlist_head console_list; /** * console_srcu_read_flags - Locklessly read flags of a possibly registered * console * @con: struct console pointer of console to read flags from * * Locklessly reading @con->flags provides a consistent read value because * there is at most one CPU modifying @con->flags and that CPU is using only * read-modify-write operations to do so. * * Requires console_srcu_read_lock to be held, which implies that @con might * be a registered console. The purpose of holding console_srcu_read_lock is * to guarantee that the console state is valid (CON_SUSPENDED/CON_ENABLED) * and that no exit/cleanup routines will run if the console is currently * undergoing unregistration. * * If the caller is holding the console_list_lock or it is _certain_ that * @con is not and will not become registered, the caller may read * @con->flags directly instead. * * Context: Any context. * Return: The current value of the @con->flags field. */ static inline short console_srcu_read_flags(const struct console *con) { WARN_ON_ONCE(!console_srcu_read_lock_is_held()); /* * The READ_ONCE() matches the WRITE_ONCE() when @flags are modified * for registered consoles with console_srcu_write_flags(). */ return data_race(READ_ONCE(con->flags)); } /** * console_srcu_write_flags - Write flags for a registered console * @con: struct console pointer of console to write flags to * @flags: new flags value to write * * Only use this function to write flags for registered consoles. It * requires holding the console_list_lock. * * Context: Any context. */ static inline void console_srcu_write_flags(struct console *con, short flags) { lockdep_assert_console_list_lock_held(); /* This matches the READ_ONCE() in console_srcu_read_flags(). */ WRITE_ONCE(con->flags, flags); } /* Variant of console_is_registered() when the console_list_lock is held. */ static inline bool console_is_registered_locked(const struct console *con) { lockdep_assert_console_list_lock_held(); return !hlist_unhashed(&con->node); } /* * console_is_registered - Check if the console is registered * @con: struct console pointer of console to check * * Context: Process context. May sleep while acquiring console list lock. * Return: true if the console is in the console list, otherwise false. * * If false is returned for a console that was previously registered, it * can be assumed that the console's unregistration is fully completed, * including the exit() callback after console list removal. 
*/ static inline bool console_is_registered(const struct console *con) { bool ret; console_list_lock(); ret = console_is_registered_locked(con); console_list_unlock(); return ret; } /** * for_each_console_srcu() - Iterator over registered consoles * @con: struct console pointer used as loop cursor * * Although SRCU guarantees the console list will be consistent, the * struct console fields may be updated by other CPUs while iterating. * * Requires console_srcu_read_lock to be held. Can be invoked from * any context. */ #define for_each_console_srcu(con) \ hlist_for_each_entry_srcu(con, &console_list, node, \ console_srcu_read_lock_is_held()) /** * for_each_console() - Iterator over registered consoles * @con: struct console pointer used as loop cursor * * The console list and the &console.flags are immutable while iterating. * * Requires console_list_lock to be held. */ #define for_each_console(con) \ lockdep_assert_console_list_lock_held(); \ hlist_for_each_entry(con, &console_list, node) #ifdef CONFIG_PRINTK extern void nbcon_cpu_emergency_enter(void); extern void nbcon_cpu_emergency_exit(void); extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt); #else static inline void nbcon_cpu_emergency_enter(void) { } static inline void nbcon_cpu_emergency_exit(void) { } static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { } #endif extern int console_set_on_cmdline; extern struct console *early_console; enum con_flush_mode { CONSOLE_FLUSH_PENDING, CONSOLE_REPLAY_ALL, }; extern int add_preferred_console(const char *name, const short idx, char *options); extern void console_force_preferred_locked(struct console *con); extern void register_console(struct console *); extern int unregister_console(struct console *); extern void console_lock(void); extern int console_trylock(void); extern void console_unlock(void); extern void console_conditional_schedule(void); extern void console_unblank(void); extern void console_flush_on_panic(enum con_flush_mode mode); extern struct tty_driver *console_device(int *); extern void console_suspend(struct console *); extern void console_resume(struct console *); extern int is_console_locked(void); extern int braille_register_console(struct console *, int index, char *console_options, char *braille_options); extern int braille_unregister_console(struct console *); #ifdef CONFIG_TTY extern void console_sysfs_notify(void); #else static inline void console_sysfs_notify(void) { } #endif extern bool console_suspend_enabled; /* Suspend and resume console messages over PM events */ extern void console_suspend_all(void); extern void console_resume_all(void); int mda_console_init(void); void vcs_make_sysfs(int index); void vcs_remove_sysfs(int index); /* Some debug stub to catch some of the obvious races in the VT code */ #define WARN_CONSOLE_UNLOCKED() \ WARN_ON(!atomic_read(&ignore_console_lock_warning) && \ !is_console_locked() && !oops_in_progress) /* * Increment ignore_console_lock_warning if you need to quiet * WARN_CONSOLE_UNLOCKED() for debugging purposes. 
*/ extern atomic_t ignore_console_lock_warning; DEFINE_LOCK_GUARD_0(console_lock, console_lock(), console_unlock()); extern void console_init(void); /* For deferred console takeover */ void dummycon_register_output_notifier(struct notifier_block *nb); void dummycon_unregister_output_notifier(struct notifier_block *nb); #endif /* _LINUX_CONSOLE_H */ |
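Illustrative sketch (not part of this header): a minimal legacy-style console built on the struct console interface declared above. The device name and the write routine's backend are hypothetical; CON_PRINTBUFFER requests replay of the existing log buffer once the console is registered.

#include <linux/console.h>
#include <linux/init.h>

static void excon_write(struct console *con, const char *s, unsigned int count)
{
	/* Push @count bytes from @s to the (hypothetical) output device here. */
}

static struct console excon_console = {
	.name	= "excon",
	.write	= excon_write,
	.flags	= CON_PRINTBUFFER,
	.index	= -1,		/* not bound to a specific instance */
};

static int __init excon_init(void)
{
	register_console(&excon_console);
	return 0;
}
console_initcall(excon_init);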
/* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions related to Power Management Quality of Service (PM QoS). * * Copyright (C) 2020 Intel Corporation * * Authors: * Mark Gross <mgross@linux.intel.com> * Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ #ifndef _LINUX_PM_QOS_H #define _LINUX_PM_QOS_H #include <linux/plist.h> #include <linux/notifier.h> #include <linux/device.h> enum pm_qos_flags_status { PM_QOS_FLAGS_UNDEFINED = -1, PM_QOS_FLAGS_NONE, PM_QOS_FLAGS_SOME, PM_QOS_FLAGS_ALL, }; #define PM_QOS_DEFAULT_VALUE (-1) #define PM_QOS_LATENCY_ANY S32_MAX #define PM_QOS_LATENCY_ANY_NS ((s64)PM_QOS_LATENCY_ANY * NSEC_PER_USEC) #define PM_QOS_CPU_LATENCY_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_RESUME_LATENCY_DEFAULT_VALUE PM_QOS_LATENCY_ANY #define PM_QOS_RESUME_LATENCY_NO_CONSTRAINT PM_QOS_LATENCY_ANY #define PM_QOS_RESUME_LATENCY_NO_CONSTRAINT_NS PM_QOS_LATENCY_ANY_NS #define PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE 0 #define PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE 0 #define PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE FREQ_QOS_MAX_DEFAULT_VALUE #define PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT (-1) #define PM_QOS_FLAG_NO_POWER_OFF (1 << 0) enum pm_qos_type { PM_QOS_UNITIALIZED, PM_QOS_MAX, /* return the largest value */ PM_QOS_MIN, /* return the smallest value */ }; /* * Note: The lockless read path depends on the CPU accessing target_value * or effective_flags atomically.
Atomic access is only guaranteed on all CPU * types linux supports for 32 bit quantites */ struct pm_qos_constraints { struct plist_head list; s32 target_value; /* Do not change to 64 bit */ s32 default_value; s32 no_constraint_value; enum pm_qos_type type; struct blocking_notifier_head *notifiers; }; struct pm_qos_request { struct plist_node node; struct pm_qos_constraints *qos; }; struct pm_qos_flags_request { struct list_head node; s32 flags; /* Do not change to 64 bit */ }; struct pm_qos_flags { struct list_head list; s32 effective_flags; /* Do not change to 64 bit */ }; #define FREQ_QOS_MIN_DEFAULT_VALUE 0 #define FREQ_QOS_MAX_DEFAULT_VALUE S32_MAX enum freq_qos_req_type { FREQ_QOS_MIN = 1, FREQ_QOS_MAX, }; struct freq_constraints { struct pm_qos_constraints min_freq; struct blocking_notifier_head min_freq_notifiers; struct pm_qos_constraints max_freq; struct blocking_notifier_head max_freq_notifiers; }; struct freq_qos_request { enum freq_qos_req_type type; struct plist_node pnode; struct freq_constraints *qos; }; enum dev_pm_qos_req_type { DEV_PM_QOS_RESUME_LATENCY = 1, DEV_PM_QOS_LATENCY_TOLERANCE, DEV_PM_QOS_MIN_FREQUENCY, DEV_PM_QOS_MAX_FREQUENCY, DEV_PM_QOS_FLAGS, }; struct dev_pm_qos_request { enum dev_pm_qos_req_type type; union { struct plist_node pnode; struct pm_qos_flags_request flr; struct freq_qos_request freq; } data; struct device *dev; }; struct dev_pm_qos { struct pm_qos_constraints resume_latency; struct pm_qos_constraints latency_tolerance; struct freq_constraints freq; struct pm_qos_flags flags; struct dev_pm_qos_request *resume_latency_req; struct dev_pm_qos_request *latency_tolerance_req; struct dev_pm_qos_request *flags_req; }; /* Action requested to pm_qos_update_target */ enum pm_qos_req_action { PM_QOS_ADD_REQ, /* Add a new request */ PM_QOS_UPDATE_REQ, /* Update an existing request */ PM_QOS_REMOVE_REQ /* Remove an existing request */ }; static inline int dev_pm_qos_request_active(struct dev_pm_qos_request *req) { return req->dev != NULL; } s32 pm_qos_read_value(struct pm_qos_constraints *c); int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, enum pm_qos_req_action action, int value); bool pm_qos_update_flags(struct pm_qos_flags *pqf, struct pm_qos_flags_request *req, enum pm_qos_req_action action, s32 val); #ifdef CONFIG_CPU_IDLE s32 cpu_latency_qos_limit(void); bool cpu_latency_qos_request_active(struct pm_qos_request *req); void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value); void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value); void cpu_latency_qos_remove_request(struct pm_qos_request *req); #else static inline s32 cpu_latency_qos_limit(void) { return INT_MAX; } static inline bool cpu_latency_qos_request_active(struct pm_qos_request *req) { return false; } static inline void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value) {} static inline void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value) {} static inline void cpu_latency_qos_remove_request(struct pm_qos_request *req) {} #endif #ifdef CONFIG_PM enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask); enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask); s32 __dev_pm_qos_resume_latency(struct device *dev); s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type); int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value); int dev_pm_qos_update_request(struct 
dev_pm_qos_request *req, s32 new_value); int dev_pm_qos_remove_request(struct dev_pm_qos_request *req); int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type); int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type); void dev_pm_qos_constraints_init(struct device *dev); void dev_pm_qos_constraints_destroy(struct device *dev); int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value); int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value); void dev_pm_qos_hide_latency_limit(struct device *dev); int dev_pm_qos_expose_flags(struct device *dev, s32 value); void dev_pm_qos_hide_flags(struct device *dev); int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set); s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev); int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val); int dev_pm_qos_expose_latency_tolerance(struct device *dev); void dev_pm_qos_hide_latency_tolerance(struct device *dev); static inline s32 dev_pm_qos_requested_resume_latency(struct device *dev) { return dev->power.qos->resume_latency_req->data.pnode.prio; } static inline s32 dev_pm_qos_requested_flags(struct device *dev) { return dev->power.qos->flags_req->data.flr.flags; } static inline s32 dev_pm_qos_raw_resume_latency(struct device *dev) { return IS_ERR_OR_NULL(dev->power.qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : pm_qos_read_value(&dev->power.qos->resume_latency); } #else static inline enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask) { return PM_QOS_FLAGS_UNDEFINED; } static inline enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask) { return PM_QOS_FLAGS_UNDEFINED; } static inline s32 __dev_pm_qos_resume_latency(struct device *dev) { return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } static inline s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type) { switch (type) { case DEV_PM_QOS_RESUME_LATENCY: return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; case DEV_PM_QOS_MIN_FREQUENCY: return PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE; case DEV_PM_QOS_MAX_FREQUENCY: return PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE; default: WARN_ON(1); return 0; } } static inline int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { return 0; } static inline int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { return 0; } static inline int dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { return 0; } static inline int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { return 0; } static inline int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { return 0; } static inline void dev_pm_qos_constraints_init(struct device *dev) { dev->power.power_state = PMSG_ON; } static inline void dev_pm_qos_constraints_destroy(struct device *dev) { dev->power.power_state = PMSG_INVALID; } static inline int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { return 0; } static inline int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value) { return 0; } static inline void dev_pm_qos_hide_latency_limit(struct device *dev) {} static inline int dev_pm_qos_expose_flags(struct device 
*dev, s32 value) { return 0; } static inline void dev_pm_qos_hide_flags(struct device *dev) {} static inline int dev_pm_qos_update_flags(struct device *dev, s32 m, bool set) { return 0; } static inline s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev) { return PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; } static inline int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) { return 0; } static inline int dev_pm_qos_expose_latency_tolerance(struct device *dev) { return 0; } static inline void dev_pm_qos_hide_latency_tolerance(struct device *dev) {} static inline s32 dev_pm_qos_requested_resume_latency(struct device *dev) { return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } static inline s32 dev_pm_qos_requested_flags(struct device *dev) { return 0; } static inline s32 dev_pm_qos_raw_resume_latency(struct device *dev) { return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } #endif static inline int freq_qos_request_active(struct freq_qos_request *req) { return !IS_ERR_OR_NULL(req->qos); } void freq_constraints_init(struct freq_constraints *qos); s32 freq_qos_read_value(struct freq_constraints *qos, enum freq_qos_req_type type); int freq_qos_add_request(struct freq_constraints *qos, struct freq_qos_request *req, enum freq_qos_req_type type, s32 value); int freq_qos_update_request(struct freq_qos_request *req, s32 new_value); int freq_qos_remove_request(struct freq_qos_request *req); int freq_qos_apply(struct freq_qos_request *req, enum pm_qos_req_action action, s32 value); int freq_qos_add_notifier(struct freq_constraints *qos, enum freq_qos_req_type type, struct notifier_block *notifier); int freq_qos_remove_notifier(struct freq_constraints *qos, enum freq_qos_req_type type, struct notifier_block *notifier); #endif |
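Illustrative sketch (not part of this header): a driver holding a CPU latency QoS request around a latency-critical section, using the CONFIG_CPU_IDLE helpers declared above. The 20 usec bound is a made-up example value.

#include <linux/pm_qos.h>

static struct pm_qos_request example_cpu_qos;

static void example_latency_critical_begin(void)
{
	/* Limit CPU exit latency to 20 usec while the request is active. */
	cpu_latency_qos_add_request(&example_cpu_qos, 20);
}

static void example_latency_critical_end(void)
{
	if (cpu_latency_qos_request_active(&example_cpu_qos))
		cpu_latency_qos_remove_request(&example_cpu_qos);
}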
// SPDX-License-Identifier: GPL-2.0-or-later /* * PPP async serial channel driver for Linux. * * Copyright 1999 Paul Mackerras. * * This driver provides the encapsulation and framing for sending * and receiving PPP frames over async serial lines. It relies on * the generic PPP layer to give it frames to send and to process * received frames. It implements the PPP line discipline. * * Part of the code in this driver was inspired by the old async-only * PPP driver, written by Michael Callahan and Al Longyear, and * subsequently hacked by Paul Mackerras. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/tty.h> #include <linux/netdevice.h> #include <linux/poll.h> #include <linux/crc-ccitt.h> #include <linux/ppp_defs.h> #include <linux/ppp-ioctl.h> #include <linux/ppp_channel.h> #include <linux/spinlock.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/jiffies.h> #include <linux/slab.h> #include <linux/unaligned.h> #include <linux/uaccess.h> #include <asm/string.h> #define PPP_VERSION "2.4.2" #define OBUFSIZE 4096 /* Structure for storing local state. */ struct asyncppp { struct tty_struct *tty; unsigned int flags; unsigned int state; unsigned int rbits; int mru; spinlock_t xmit_lock; spinlock_t recv_lock; unsigned long xmit_flags; u32 xaccm[8]; u32 raccm; unsigned int bytes_sent; unsigned int bytes_rcvd; struct sk_buff *tpkt; int tpkt_pos; u16 tfcs; unsigned char *optr; unsigned char *olim; unsigned long last_xmit; struct sk_buff *rpkt; int lcp_fcs; struct sk_buff_head rqueue; struct tasklet_struct tsk; refcount_t refcnt; struct completion dead; struct ppp_channel chan; /* interface to generic ppp layer */ unsigned char obuf[OBUFSIZE]; }; /* Bit numbers in xmit_flags */ #define XMIT_WAKEUP 0 #define XMIT_FULL 1 #define XMIT_BUSY 2 /* State bits */ #define SC_TOSS 1 #define SC_ESCAPE 2 #define SC_PREV_ERROR 4 /* Bits in rbits */ #define SC_RCV_BITS (SC_RCV_B7_1|SC_RCV_B7_0|SC_RCV_ODDP|SC_RCV_EVNP) static int flag_time = HZ; module_param(flag_time, int, 0); MODULE_PARM_DESC(flag_time, "ppp_async: interval between flagged packets (in clock ticks)"); MODULE_DESCRIPTION("PPP async serial channel module"); MODULE_LICENSE("GPL"); MODULE_ALIAS_LDISC(N_PPP); /* * Prototypes.
*/ static int ppp_async_encode(struct asyncppp *ap); static int ppp_async_send(struct ppp_channel *chan, struct sk_buff *skb); static int ppp_async_push(struct asyncppp *ap); static void ppp_async_flush_output(struct asyncppp *ap); static void ppp_async_input(struct asyncppp *ap, const unsigned char *buf, const u8 *flags, int count); static int ppp_async_ioctl(struct ppp_channel *chan, unsigned int cmd, unsigned long arg); static void ppp_async_process(struct tasklet_struct *t); static void async_lcp_peek(struct asyncppp *ap, unsigned char *data, int len, int inbound); static const struct ppp_channel_ops async_ops = { .start_xmit = ppp_async_send, .ioctl = ppp_async_ioctl, }; /* * Routines implementing the PPP line discipline. */ /* * We have a potential race on dereferencing tty->disc_data, * because the tty layer provides no locking at all - thus one * cpu could be running ppp_asynctty_receive while another * calls ppp_asynctty_close, which zeroes tty->disc_data and * frees the memory that ppp_asynctty_receive is using. The best * way to fix this is to use a rwlock in the tty struct, but for now * we use a single global rwlock for all ttys in ppp line discipline. * * FIXME: this is no longer true. The _close path for the ldisc is * now guaranteed to be sane. */ static DEFINE_RWLOCK(disc_data_lock); static struct asyncppp *ap_get(struct tty_struct *tty) { struct asyncppp *ap; read_lock(&disc_data_lock); ap = tty->disc_data; if (ap != NULL) refcount_inc(&ap->refcnt); read_unlock(&disc_data_lock); return ap; } static void ap_put(struct asyncppp *ap) { if (refcount_dec_and_test(&ap->refcnt)) complete(&ap->dead); } /* * Called when a tty is put into PPP line discipline. Called in process * context. */ static int ppp_asynctty_open(struct tty_struct *tty) { struct asyncppp *ap; int err; int speed; if (tty->ops->write == NULL) return -EOPNOTSUPP; err = -ENOMEM; ap = kzalloc(sizeof(*ap), GFP_KERNEL); if (!ap) goto out; /* initialize the asyncppp structure */ ap->tty = tty; ap->mru = PPP_MRU; spin_lock_init(&ap->xmit_lock); spin_lock_init(&ap->recv_lock); ap->xaccm[0] = ~0U; ap->xaccm[3] = 0x60000000U; ap->raccm = ~0U; ap->optr = ap->obuf; ap->olim = ap->obuf; ap->lcp_fcs = -1; skb_queue_head_init(&ap->rqueue); tasklet_setup(&ap->tsk, ppp_async_process); refcount_set(&ap->refcnt, 1); init_completion(&ap->dead); ap->chan.private = ap; ap->chan.ops = &async_ops; ap->chan.mtu = PPP_MRU; speed = tty_get_baud_rate(tty); ap->chan.speed = speed; err = ppp_register_channel(&ap->chan); if (err) goto out_free; tty->disc_data = ap; tty->receive_room = 65536; return 0; out_free: kfree(ap); out: return err; } /* * Called when the tty is put into another line discipline * or it hangs up. We have to wait for any cpu currently * executing in any of the other ppp_asynctty_* routines to * finish before we can call ppp_unregister_channel and free * the asyncppp struct. This routine must be called from * process context, not interrupt or softirq context. */ static void ppp_asynctty_close(struct tty_struct *tty) { struct asyncppp *ap; write_lock_irq(&disc_data_lock); ap = tty->disc_data; tty->disc_data = NULL; write_unlock_irq(&disc_data_lock); if (!ap) return; /* * We have now ensured that nobody can start using ap from now * on, but we have to wait for all existing users to finish. * Note that ppp_unregister_channel ensures that no calls to * our channel ops (i.e. ppp_async_send/ioctl) are in progress * by the time it returns. 
*/ if (!refcount_dec_and_test(&ap->refcnt)) wait_for_completion(&ap->dead); tasklet_kill(&ap->tsk); ppp_unregister_channel(&ap->chan); kfree_skb(ap->rpkt); skb_queue_purge(&ap->rqueue); kfree_skb(ap->tpkt); kfree(ap); } /* * Called on tty hangup in process context. * * Wait for I/O to driver to complete and unregister PPP channel. * This is already done by the close routine, so just call that. */ static void ppp_asynctty_hangup(struct tty_struct *tty) { ppp_asynctty_close(tty); } /* * Read does nothing - no data is ever available this way. * Pppd reads and writes packets via /dev/ppp instead. */ static ssize_t ppp_asynctty_read(struct tty_struct *tty, struct file *file, u8 *buf, size_t count, void **cookie, unsigned long offset) { return -EAGAIN; } /* * Write on the tty does nothing, the packets all come in * from the ppp generic stuff. */ static ssize_t ppp_asynctty_write(struct tty_struct *tty, struct file *file, const u8 *buf, size_t count) { return -EAGAIN; } /* * Called in process context only. May be re-entered by multiple * ioctl calling threads. */ static int ppp_asynctty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct asyncppp *ap = ap_get(tty); int err, val; int __user *p = (int __user *)arg; if (!ap) return -ENXIO; err = -EFAULT; switch (cmd) { case PPPIOCGCHAN: err = -EFAULT; if (put_user(ppp_channel_index(&ap->chan), p)) break; err = 0; break; case PPPIOCGUNIT: err = -EFAULT; if (put_user(ppp_unit_number(&ap->chan), p)) break; err = 0; break; case TCFLSH: /* flush our buffers and the serial port's buffer */ if (arg == TCIOFLUSH || arg == TCOFLUSH) ppp_async_flush_output(ap); err = n_tty_ioctl_helper(tty, cmd, arg); break; case FIONREAD: val = 0; if (put_user(val, p)) break; err = 0; break; default: /* Try the various mode ioctls */ err = tty_mode_ioctl(tty, cmd, arg); } ap_put(ap); return err; } /* May sleep, don't call from interrupt level or with interrupts disabled */ static void ppp_asynctty_receive(struct tty_struct *tty, const u8 *buf, const u8 *cflags, size_t count) { struct asyncppp *ap = ap_get(tty); unsigned long flags; if (!ap) return; spin_lock_irqsave(&ap->recv_lock, flags); ppp_async_input(ap, buf, cflags, count); spin_unlock_irqrestore(&ap->recv_lock, flags); if (!skb_queue_empty(&ap->rqueue)) tasklet_schedule(&ap->tsk); ap_put(ap); tty_unthrottle(tty); } static void ppp_asynctty_wakeup(struct tty_struct *tty) { struct asyncppp *ap = ap_get(tty); clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); if (!ap) return; set_bit(XMIT_WAKEUP, &ap->xmit_flags); tasklet_schedule(&ap->tsk); ap_put(ap); } static struct tty_ldisc_ops ppp_ldisc = { .owner = THIS_MODULE, .num = N_PPP, .name = "ppp", .open = ppp_asynctty_open, .close = ppp_asynctty_close, .hangup = ppp_asynctty_hangup, .read = ppp_asynctty_read, .write = ppp_asynctty_write, .ioctl = ppp_asynctty_ioctl, .receive_buf = ppp_asynctty_receive, .write_wakeup = ppp_asynctty_wakeup, }; static int __init ppp_async_init(void) { int err; err = tty_register_ldisc(&ppp_ldisc); if (err != 0) printk(KERN_ERR "PPP_async: error %d registering line disc.\n", err); return err; } /* * The following routines provide the PPP channel interface. 
*/ static int ppp_async_ioctl(struct ppp_channel *chan, unsigned int cmd, unsigned long arg) { struct asyncppp *ap = chan->private; void __user *argp = (void __user *)arg; int __user *p = argp; int err, val; u32 accm[8]; err = -EFAULT; switch (cmd) { case PPPIOCGFLAGS: val = ap->flags | ap->rbits; if (put_user(val, p)) break; err = 0; break; case PPPIOCSFLAGS: if (get_user(val, p)) break; ap->flags = val & ~SC_RCV_BITS; spin_lock_irq(&ap->recv_lock); ap->rbits = val & SC_RCV_BITS; spin_unlock_irq(&ap->recv_lock); err = 0; break; case PPPIOCGASYNCMAP: if (put_user(ap->xaccm[0], (u32 __user *)argp)) break; err = 0; break; case PPPIOCSASYNCMAP: if (get_user(ap->xaccm[0], (u32 __user *)argp)) break; err = 0; break; case PPPIOCGRASYNCMAP: if (put_user(ap->raccm, (u32 __user *)argp)) break; err = 0; break; case PPPIOCSRASYNCMAP: if (get_user(ap->raccm, (u32 __user *)argp)) break; err = 0; break; case PPPIOCGXASYNCMAP: if (copy_to_user(argp, ap->xaccm, sizeof(ap->xaccm))) break; err = 0; break; case PPPIOCSXASYNCMAP: if (copy_from_user(accm, argp, sizeof(accm))) break; accm[2] &= ~0x40000000U; /* can't escape 0x5e */ accm[3] |= 0x60000000U; /* must escape 0x7d, 0x7e */ memcpy(ap->xaccm, accm, sizeof(ap->xaccm)); err = 0; break; case PPPIOCGMRU: if (put_user(ap->mru, p)) break; err = 0; break; case PPPIOCSMRU: if (get_user(val, p)) break; if (val > U16_MAX) { err = -EINVAL; break; } if (val < PPP_MRU) val = PPP_MRU; ap->mru = val; err = 0; break; default: err = -ENOTTY; } return err; } /* * This is called at softirq level to deliver received packets * to the ppp_generic code, and to tell the ppp_generic code * if we can accept more output now. */ static void ppp_async_process(struct tasklet_struct *t) { struct asyncppp *ap = from_tasklet(ap, t, tsk); struct sk_buff *skb; /* process received packets */ while ((skb = skb_dequeue(&ap->rqueue)) != NULL) { if (skb->cb[0]) ppp_input_error(&ap->chan, 0); ppp_input(&ap->chan, skb); } /* try to push more stuff out */ if (test_bit(XMIT_WAKEUP, &ap->xmit_flags) && ppp_async_push(ap)) ppp_output_wakeup(&ap->chan); } /* * Procedures for encapsulation and framing. */ /* * Procedure to encode the data for async serial transmission. * Does octet stuffing (escaping), puts the address/control bytes * on if A/C compression is disabled, and does protocol compression. * Assumes ap->tpkt != 0 on entry. * Returns 1 if we finished the current frame, 0 otherwise. */ #define PUT_BYTE(ap, buf, c, islcp) do { \ if ((islcp && c < 0x20) || (ap->xaccm[c >> 5] & (1 << (c & 0x1f)))) {\ *buf++ = PPP_ESCAPE; \ *buf++ = c ^ PPP_TRANS; \ } else \ *buf++ = c; \ } while (0) static int ppp_async_encode(struct asyncppp *ap) { int fcs, i, count, c, proto; unsigned char *buf, *buflim; unsigned char *data; int islcp; buf = ap->obuf; ap->olim = buf; ap->optr = buf; i = ap->tpkt_pos; data = ap->tpkt->data; count = ap->tpkt->len; fcs = ap->tfcs; proto = get_unaligned_be16(data); /* * LCP packets with code values between 1 (configure-request) * and 7 (code-reject) must be sent as though no options * had been negotiated. */ islcp = proto == PPP_LCP && count >= 3 && 1 <= data[2] && data[2] <= 7; if (i == 0) { if (islcp) async_lcp_peek(ap, data, count, 0); /* * Start of a new packet - insert the leading FLAG * character if necessary. 
*/ if (islcp || flag_time == 0 || time_after_eq(jiffies, ap->last_xmit + flag_time)) *buf++ = PPP_FLAG; ap->last_xmit = jiffies; fcs = PPP_INITFCS; /* * Put in the address/control bytes if necessary */ if ((ap->flags & SC_COMP_AC) == 0 || islcp) { PUT_BYTE(ap, buf, 0xff, islcp); fcs = PPP_FCS(fcs, 0xff); PUT_BYTE(ap, buf, 0x03, islcp); fcs = PPP_FCS(fcs, 0x03); } } /* * Once we put in the last byte, we need to put in the FCS * and closing flag, so make sure there is at least 7 bytes * of free space in the output buffer. */ buflim = ap->obuf + OBUFSIZE - 6; while (i < count && buf < buflim) { c = data[i++]; if (i == 1 && c == 0 && (ap->flags & SC_COMP_PROT)) continue; /* compress protocol field */ fcs = PPP_FCS(fcs, c); PUT_BYTE(ap, buf, c, islcp); } if (i < count) { /* * Remember where we are up to in this packet. */ ap->olim = buf; ap->tpkt_pos = i; ap->tfcs = fcs; return 0; } /* * We have finished the packet. Add the FCS and flag. */ fcs = ~fcs; c = fcs & 0xff; PUT_BYTE(ap, buf, c, islcp); c = (fcs >> 8) & 0xff; PUT_BYTE(ap, buf, c, islcp); *buf++ = PPP_FLAG; ap->olim = buf; consume_skb(ap->tpkt); ap->tpkt = NULL; return 1; } /* * Transmit-side routines. */ /* * Send a packet to the peer over an async tty line. * Returns 1 iff the packet was accepted. * If the packet was not accepted, we will call ppp_output_wakeup * at some later time. */ static int ppp_async_send(struct ppp_channel *chan, struct sk_buff *skb) { struct asyncppp *ap = chan->private; ppp_async_push(ap); if (test_and_set_bit(XMIT_FULL, &ap->xmit_flags)) return 0; /* already full */ ap->tpkt = skb; ap->tpkt_pos = 0; ppp_async_push(ap); return 1; } /* * Push as much data as possible out to the tty. */ static int ppp_async_push(struct asyncppp *ap) { int avail, sent, done = 0; struct tty_struct *tty = ap->tty; int tty_stuffed = 0; /* * We can get called recursively here if the tty write * function calls our wakeup function. This can happen * for example on a pty with both the master and slave * set to PPP line discipline. * We use the XMIT_BUSY bit to detect this and get out, * leaving the XMIT_WAKEUP bit set to tell the other * instance that it may now be able to write more now. */ if (test_and_set_bit(XMIT_BUSY, &ap->xmit_flags)) return 0; spin_lock_bh(&ap->xmit_lock); for (;;) { if (test_and_clear_bit(XMIT_WAKEUP, &ap->xmit_flags)) tty_stuffed = 0; if (!tty_stuffed && ap->optr < ap->olim) { avail = ap->olim - ap->optr; set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); sent = tty->ops->write(tty, ap->optr, avail); if (sent < 0) goto flush; /* error, e.g. loss of CD */ ap->optr += sent; if (sent < avail) tty_stuffed = 1; continue; } if (ap->optr >= ap->olim && ap->tpkt) { if (ppp_async_encode(ap)) { /* finished processing ap->tpkt */ clear_bit(XMIT_FULL, &ap->xmit_flags); done = 1; } continue; } /* * We haven't made any progress this time around. * Clear XMIT_BUSY to let other callers in, but * after doing so we have to check if anyone set * XMIT_WAKEUP since we last checked it. If they * did, we should try again to set XMIT_BUSY and go * around again in case XMIT_BUSY was still set when * the other caller tried. */ clear_bit(XMIT_BUSY, &ap->xmit_flags); /* any more work to do? 
if not, exit the loop */ if (!(test_bit(XMIT_WAKEUP, &ap->xmit_flags) || (!tty_stuffed && ap->tpkt))) break; /* more work to do, see if we can do it now */ if (test_and_set_bit(XMIT_BUSY, &ap->xmit_flags)) break; } spin_unlock_bh(&ap->xmit_lock); return done; flush: clear_bit(XMIT_BUSY, &ap->xmit_flags); if (ap->tpkt) { kfree_skb(ap->tpkt); ap->tpkt = NULL; clear_bit(XMIT_FULL, &ap->xmit_flags); done = 1; } ap->optr = ap->olim; spin_unlock_bh(&ap->xmit_lock); return done; } /* * Flush output from our internal buffers. * Called for the TCFLSH ioctl. Can be entered in parallel * but this is covered by the xmit_lock. */ static void ppp_async_flush_output(struct asyncppp *ap) { int done = 0; spin_lock_bh(&ap->xmit_lock); ap->optr = ap->olim; if (ap->tpkt != NULL) { kfree_skb(ap->tpkt); ap->tpkt = NULL; clear_bit(XMIT_FULL, &ap->xmit_flags); done = 1; } spin_unlock_bh(&ap->xmit_lock); if (done) ppp_output_wakeup(&ap->chan); } /* * Receive-side routines. */ /* see how many ordinary chars there are at the start of buf */ static inline int scan_ordinary(struct asyncppp *ap, const unsigned char *buf, int count) { int i, c; for (i = 0; i < count; ++i) { c = buf[i]; if (c == PPP_ESCAPE || c == PPP_FLAG || (c < 0x20 && (ap->raccm & (1 << c)) != 0)) break; } return i; } /* called when a flag is seen - do end-of-packet processing */ static void process_input_packet(struct asyncppp *ap) { struct sk_buff *skb; unsigned char *p; unsigned int len, fcs; skb = ap->rpkt; if (ap->state & (SC_TOSS | SC_ESCAPE)) goto err; if (skb == NULL) return; /* 0-length packet */ /* check the FCS */ p = skb->data; len = skb->len; if (len < 3) goto err; /* too short */ fcs = PPP_INITFCS; for (; len > 0; --len) fcs = PPP_FCS(fcs, *p++); if (fcs != PPP_GOODFCS) goto err; /* bad FCS */ skb_trim(skb, skb->len - 2); /* check for address/control and protocol compression */ p = skb->data; if (p[0] == PPP_ALLSTATIONS) { /* chop off address/control */ if (p[1] != PPP_UI || skb->len < 3) goto err; p = skb_pull(skb, 2); } /* If protocol field is not compressed, it can be LCP packet */ if (!(p[0] & 0x01)) { unsigned int proto; if (skb->len < 2) goto err; proto = (p[0] << 8) + p[1]; if (proto == PPP_LCP) async_lcp_peek(ap, p, skb->len, 1); } /* queue the frame to be processed */ skb->cb[0] = ap->state; skb_queue_tail(&ap->rqueue, skb); ap->rpkt = NULL; ap->state = 0; return; err: /* frame had an error, remember that, reset SC_TOSS & SC_ESCAPE */ ap->state = SC_PREV_ERROR; if (skb) { /* make skb appear as freshly allocated */ skb_trim(skb, 0); skb_reserve(skb, - skb_headroom(skb)); } } /* Called when the tty driver has data for us. Runs parallel with the other ldisc functions but will not be re-entered */ static void ppp_async_input(struct asyncppp *ap, const u8 *buf, const u8 *flags, int count) { struct sk_buff *skb; int c, i, j, n, s, f; unsigned char *sp; /* update bits used for 8-bit cleanness detection */ if (~ap->rbits & SC_RCV_BITS) { s = 0; for (i = 0; i < count; ++i) { c = buf[i]; if (flags && flags[i] != 0) continue; s |= (c & 0x80)? SC_RCV_B7_1: SC_RCV_B7_0; c = ((c >> 4) ^ c) & 0xf; s |= (0x6996 & (1 << c))? 
SC_RCV_ODDP: SC_RCV_EVNP; } ap->rbits |= s; } while (count > 0) { /* scan through and see how many chars we can do in bulk */ if ((ap->state & SC_ESCAPE) && buf[0] == PPP_ESCAPE) n = 1; else n = scan_ordinary(ap, buf, count); f = 0; if (flags && (ap->state & SC_TOSS) == 0) { /* check the flags to see if any char had an error */ for (j = 0; j < n; ++j) if ((f = flags[j]) != 0) break; } if (f != 0) { /* start tossing */ ap->state |= SC_TOSS; } else if (n > 0 && (ap->state & SC_TOSS) == 0) { /* stuff the chars in the skb */ skb = ap->rpkt; if (!skb) { skb = dev_alloc_skb(ap->mru + PPP_HDRLEN + 2); if (!skb) goto nomem; ap->rpkt = skb; } if (skb->len == 0) { /* Try to get the payload 4-byte aligned. * This should match the * PPP_ALLSTATIONS/PPP_UI/compressed tests in * process_input_packet, but we do not have * enough chars here to test buf[1] and buf[2]. */ if (buf[0] != PPP_ALLSTATIONS) skb_reserve(skb, 2 + (buf[0] & 1)); } if (n > skb_tailroom(skb)) { /* packet overflowed MRU */ ap->state |= SC_TOSS; } else { sp = skb_put_data(skb, buf, n); if (ap->state & SC_ESCAPE) { sp[0] ^= PPP_TRANS; ap->state &= ~SC_ESCAPE; } } } if (n >= count) break; c = buf[n]; if (flags != NULL && flags[n] != 0) { ap->state |= SC_TOSS; } else if (c == PPP_FLAG) { process_input_packet(ap); } else if (c == PPP_ESCAPE) { ap->state |= SC_ESCAPE; } else if (I_IXON(ap->tty)) { if (c == START_CHAR(ap->tty)) start_tty(ap->tty); else if (c == STOP_CHAR(ap->tty)) stop_tty(ap->tty); } /* otherwise it's a char in the recv ACCM */ ++n; buf += n; if (flags) flags += n; count -= n; } return; nomem: printk(KERN_ERR "PPPasync: no memory (input pkt)\n"); ap->state |= SC_TOSS; } /* * We look at LCP frames going past so that we can notice * and react to the LCP configure-ack from the peer. * In the situation where the peer has been sent a configure-ack * already, LCP is up once it has sent its configure-ack * so the immediately following packet can be sent with the * configured LCP options. This allows us to process the following * packet correctly without pppd needing to respond quickly. * * We only respond to the received configure-ack if we have just * sent a configure-request, and the configure-ack contains the * same data (this is checked using a 16-bit crc of the data). */ #define CONFREQ 1 /* LCP code field values */ #define CONFACK 2 #define LCP_MRU 1 /* LCP option numbers */ #define LCP_ASYNCMAP 2 static void async_lcp_peek(struct asyncppp *ap, unsigned char *data, int len, int inbound) { int dlen, fcs, i, code; u32 val; data += 2; /* skip protocol bytes */ len -= 2; if (len < 4) /* 4 = code, ID, length */ return; code = data[0]; if (code != CONFACK && code != CONFREQ) return; dlen = get_unaligned_be16(data + 2); if (len < dlen) return; /* packet got truncated or length is bogus */ if (code == (inbound? CONFACK: CONFREQ)) { /* * sent confreq or received confack: * calculate the crc of the data from the ID field on. 
*/ fcs = PPP_INITFCS; for (i = 1; i < dlen; ++i) fcs = PPP_FCS(fcs, data[i]); if (!inbound) { /* outbound confreq - remember the crc for later */ ap->lcp_fcs = fcs; return; } /* received confack, check the crc */ fcs ^= ap->lcp_fcs; ap->lcp_fcs = -1; if (fcs != 0) return; } else if (inbound) return; /* not interested in received confreq */ /* process the options in the confack */ data += 4; dlen -= 4; /* data[0] is code, data[1] is length */ while (dlen >= 2 && dlen >= data[1] && data[1] >= 2) { switch (data[0]) { case LCP_MRU: val = get_unaligned_be16(data + 2); if (inbound) ap->mru = val; else ap->chan.mtu = val; break; case LCP_ASYNCMAP: val = get_unaligned_be32(data + 2); if (inbound) ap->raccm = val; else ap->xaccm[0] = val; break; } dlen -= data[1]; data += data[1]; } } static void __exit ppp_async_cleanup(void) { tty_unregister_ldisc(&ppp_ldisc); } module_init(ppp_async_init); module_exit(ppp_async_cleanup); |
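
The framing that ppp_async_encode() performs above (CRC-CCITT FCS over the payload, PUT_BYTE octet stuffing, 0x7e flag delimiters) can be reproduced outside the kernel. The following stand-alone C sketch is illustrative only and is not part of the driver: it assumes the default transmit ACCM installed by ppp_asynctty_open() (xaccm[0] = ~0U plus 0x7d/0x7e, i.e. escape every byte below 0x20 and the escape/flag bytes themselves) and always emits a leading flag, whereas the driver may omit it when packets follow each other within flag_time.

/*
 * User-space illustration (not driver code): frame one PPP packet the way
 * ppp_async_encode() does - accumulate the RFC 1662 FCS-16 (what the driver
 * gets from PPP_FCS()), byte-stuff with PPP_ESCAPE/PPP_TRANS, and delimit
 * with PPP_FLAG.
 */
#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

#define PPP_FLAG	0x7e
#define PPP_ESCAPE	0x7d
#define PPP_TRANS	0x20
#define PPP_INITFCS	0xffff

/* One RFC 1662 FCS-16 step (reversed 0x8408 polynomial). */
static uint16_t ppp_fcs16(uint16_t fcs, uint8_t c)
{
	fcs ^= c;
	for (int i = 0; i < 8; i++)
		fcs = (fcs & 1) ? (fcs >> 1) ^ 0x8408 : fcs >> 1;
	return fcs;
}

/* Default transmit map: escape control chars plus the escape and flag bytes. */
static int must_escape(uint8_t c)
{
	return c < 0x20 || c == PPP_ESCAPE || c == PPP_FLAG;
}

static size_t put_byte(uint8_t *out, size_t pos, uint8_t c)
{
	if (must_escape(c)) {
		out[pos++] = PPP_ESCAPE;
		out[pos++] = c ^ PPP_TRANS;
	} else {
		out[pos++] = c;
	}
	return pos;
}

/* Frame data[0..len) into out[] and return the framed length. */
static size_t frame_packet(const uint8_t *data, size_t len, uint8_t *out)
{
	uint16_t fcs = PPP_INITFCS;
	size_t pos = 0;

	out[pos++] = PPP_FLAG;			/* opening flag */
	for (size_t i = 0; i < len; i++) {
		fcs = ppp_fcs16(fcs, data[i]);
		pos = put_byte(out, pos, data[i]);
	}
	fcs = ~fcs;				/* send one's complement, low byte first */
	pos = put_byte(out, pos, fcs & 0xff);
	pos = put_byte(out, pos, (fcs >> 8) & 0xff);
	out[pos++] = PPP_FLAG;			/* closing flag */
	return pos;
}

int main(void)
{
	/* Address/control, LCP protocol, then a dummy code/id/length header. */
	uint8_t pkt[] = { 0xff, 0x03, 0xc0, 0x21, 0x01, 0x01, 0x00, 0x04 };
	uint8_t buf[2 * sizeof(pkt) + 6];	/* worst case: every byte escaped */
	size_t n = frame_packet(pkt, sizeof(pkt), buf);

	for (size_t i = 0; i < n; i++)
		printf("%02x ", buf[i]);
	printf("\n");
	return 0;
}

Feeding it the small LCP-style packet in main() shows the stuffing at work: the 0x00 and 0x01 bytes go on the wire as 7d 20 and 7d 21, and the frame ends with the one's-complemented FCS followed by 7e.
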
// SPDX-License-Identifier: GPL-2.0-or-later /* AFS cell and server record management * * Copyright (C) 2002, 2017 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/slab.h> #include <linux/key.h> #include <linux/ctype.h> #include <linux/dns_resolver.h> #include <linux/sched.h> #include <linux/inet.h> #include <linux/namei.h> #include <keys/rxrpc-type.h> #include "internal.h" static unsigned __read_mostly afs_cell_gc_delay = 10; static unsigned __read_mostly afs_cell_min_ttl = 10 * 60; static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60; static atomic_t cell_debug_id; static void afs_cell_timer(struct timer_list *timer); static void afs_destroy_cell_work(struct work_struct *work); static void afs_manage_cell_work(struct work_struct *work); static void afs_dec_cells_outstanding(struct afs_net *net) { if (atomic_dec_and_test(&net->cells_outstanding)) wake_up_var(&net->cells_outstanding); } static void afs_set_cell_state(struct afs_cell *cell, enum afs_cell_state state) { smp_store_release(&cell->state, state); /* Commit cell changes before state */ smp_wmb(); /* Set cell state before task state */ wake_up_var(&cell->state); } /* * Look up and get an activation reference on a cell record. The caller must * hold net->cells_lock at least read-locked. */ static struct afs_cell *afs_find_cell_locked(struct afs_net *net, const char *name, unsigned int namesz, enum afs_cell_trace reason) { struct afs_cell *cell = NULL; struct rb_node *p; int n; _enter("%*.*s", namesz, namesz, name); if (name && namesz == 0) return ERR_PTR(-EINVAL); if (namesz > AFS_MAXCELLNAME) return ERR_PTR(-ENAMETOOLONG); if (!name) { cell = rcu_dereference_protected(net->ws_cell, lockdep_is_held(&net->cells_lock)); if (!cell) return ERR_PTR(-EDESTADDRREQ); goto found; } p = net->cells.rb_node; while (p) { cell = rb_entry(p, struct afs_cell, net_node); n = strncasecmp(cell->name, name, min_t(size_t, cell->name_len, namesz)); if (n == 0) n = cell->name_len - namesz; if (n < 0) p = p->rb_left; else if (n > 0) p = p->rb_right; else goto found; } return ERR_PTR(-ENOENT); found: return afs_use_cell(cell, reason); } /* * Look up and get an activation reference on a cell record. */ struct afs_cell *afs_find_cell(struct afs_net *net, const char *name, unsigned int namesz, enum afs_cell_trace reason) { struct afs_cell *cell; down_read(&net->cells_lock); cell = afs_find_cell_locked(net, name, namesz, reason); up_read(&net->cells_lock); return cell; } /* * Set up a cell record and fill in its name, VL server address list and * allocate an anonymous key */ static struct afs_cell *afs_alloc_cell(struct afs_net *net, const char *name, unsigned int namelen, const char *addresses) { struct afs_vlserver_list *vllist = NULL; struct afs_cell *cell; int i, ret; ASSERT(name); if (namelen == 0) return ERR_PTR(-EINVAL); if (namelen > AFS_MAXCELLNAME) { _leave(" = -ENAMETOOLONG"); return ERR_PTR(-ENAMETOOLONG); } /* Prohibit cell names that contain unprintable chars, '/' and '@' or * that begin with a dot. This also precludes "@cell".
*/ if (name[0] == '.') return ERR_PTR(-EINVAL); for (i = 0; i < namelen; i++) { char ch = name[i]; if (!isprint(ch) || ch == '/' || ch == '@') return ERR_PTR(-EINVAL); } _enter("%*.*s,%s", namelen, namelen, name, addresses); cell = kzalloc(sizeof(struct afs_cell), GFP_KERNEL); if (!cell) { _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } /* Allocate the cell name and the key name in one go. */ cell->name = kmalloc(1 + namelen + 1 + 4 + namelen + 1, GFP_KERNEL); if (!cell->name) { kfree(cell); return ERR_PTR(-ENOMEM); } cell->name[0] = '.'; cell->name++; cell->name_len = namelen; for (i = 0; i < namelen; i++) cell->name[i] = tolower(name[i]); cell->name[i++] = 0; cell->key_desc = cell->name + i; memcpy(cell->key_desc, "afs@", 4); memcpy(cell->key_desc + 4, cell->name, cell->name_len + 1); cell->net = net; refcount_set(&cell->ref, 1); atomic_set(&cell->active, 0); INIT_WORK(&cell->destroyer, afs_destroy_cell_work); INIT_WORK(&cell->manager, afs_manage_cell_work); timer_setup(&cell->management_timer, afs_cell_timer, 0); init_rwsem(&cell->vs_lock); cell->volumes = RB_ROOT; INIT_HLIST_HEAD(&cell->proc_volumes); seqlock_init(&cell->volume_lock); cell->fs_servers = RB_ROOT; init_rwsem(&cell->fs_lock); rwlock_init(&cell->vl_servers_lock); cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS); /* Provide a VL server list, filling it in if we were given a list of * addresses to use. */ if (addresses) { vllist = afs_parse_text_addrs(net, addresses, strlen(addresses), ':', VL_SERVICE, AFS_VL_PORT); if (IS_ERR(vllist)) { ret = PTR_ERR(vllist); vllist = NULL; goto parse_failed; } vllist->source = DNS_RECORD_FROM_CONFIG; vllist->status = DNS_LOOKUP_NOT_DONE; cell->dns_expiry = TIME64_MAX; } else { ret = -ENOMEM; vllist = afs_alloc_vlserver_list(0); if (!vllist) goto error; vllist->source = DNS_RECORD_UNAVAILABLE; vllist->status = DNS_LOOKUP_NOT_DONE; cell->dns_expiry = ktime_get_real_seconds(); } rcu_assign_pointer(cell->vl_servers, vllist); cell->dns_source = vllist->source; cell->dns_status = vllist->status; smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */ atomic_inc(&net->cells_outstanding); ret = idr_alloc_cyclic(&net->cells_dyn_ino, cell, 2, INT_MAX / 2, GFP_KERNEL); if (ret < 0) goto error; cell->dynroot_ino = ret; cell->debug_id = atomic_inc_return(&cell_debug_id); trace_afs_cell(cell->debug_id, 1, 0, afs_cell_trace_alloc); _leave(" = %p", cell); return cell; parse_failed: if (ret == -EINVAL) printk(KERN_ERR "kAFS: bad VL server IP address\n"); error: afs_put_vlserverlist(cell->net, vllist); kfree(cell->name - 1); kfree(cell); _leave(" = %d", ret); return ERR_PTR(ret); } /* * afs_lookup_cell - Look up or create a cell record. * @net: The network namespace * @name: The name of the cell. * @namesz: The strlen of the cell name. * @vllist: A colon/comma separated list of numeric IP addresses or NULL. * @reason: The reason we're doing the lookup * @trace: The reason to be logged if the lookup is successful. * * Look up a cell record by name and query the DNS for VL server addresses if * needed. Note that that actual DNS query is punted off to the manager thread * so that this function can return immediately if interrupted whilst allowing * cell records to be shared even if not yet fully constructed. 
*/ struct afs_cell *afs_lookup_cell(struct afs_net *net, const char *name, unsigned int namesz, const char *vllist, enum afs_lookup_cell_for reason, enum afs_cell_trace trace) { struct afs_cell *cell, *candidate, *cursor; struct rb_node *parent, **pp; enum afs_cell_state state; int ret, n; _enter("%s,%s,%u", name, vllist, reason); if (reason != AFS_LOOKUP_CELL_PRELOAD) { cell = afs_find_cell(net, name, namesz, trace); if (!IS_ERR(cell)) { if (reason == AFS_LOOKUP_CELL_DYNROOT) goto no_wait; if (cell->state == AFS_CELL_SETTING_UP || cell->state == AFS_CELL_UNLOOKED) goto lookup_cell; goto wait_for_cell; } } /* Assume we're probably going to create a cell and preallocate and * mostly set up a candidate record. We can then use this to stash the * name, the net namespace and VL server addresses. * * We also want to do this before we hold any locks as it may involve * upcalling to userspace to make DNS queries. */ candidate = afs_alloc_cell(net, name, namesz, vllist); if (IS_ERR(candidate)) { _leave(" = %ld", PTR_ERR(candidate)); return candidate; } /* Find the insertion point and check to see if someone else added a * cell whilst we were allocating. */ down_write(&net->cells_lock); pp = &net->cells.rb_node; parent = NULL; while (*pp) { parent = *pp; cursor = rb_entry(parent, struct afs_cell, net_node); n = strncasecmp(cursor->name, name, min_t(size_t, cursor->name_len, namesz)); if (n == 0) n = cursor->name_len - namesz; if (n < 0) pp = &(*pp)->rb_left; else if (n > 0) pp = &(*pp)->rb_right; else goto cell_already_exists; } cell = candidate; candidate = NULL; afs_use_cell(cell, trace); rb_link_node_rcu(&cell->net_node, parent, pp); rb_insert_color(&cell->net_node, &net->cells); up_write(&net->cells_lock); lookup_cell: if (reason != AFS_LOOKUP_CELL_PRELOAD && reason != AFS_LOOKUP_CELL_ROOTCELL) { set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags); afs_queue_cell(cell, afs_cell_trace_queue_new); } wait_for_cell: state = smp_load_acquire(&cell->state); /* vs error */ switch (state) { case AFS_CELL_ACTIVE: case AFS_CELL_DEAD: break; case AFS_CELL_UNLOOKED: default: if (reason == AFS_LOOKUP_CELL_PRELOAD || reason == AFS_LOOKUP_CELL_ROOTCELL) break; _debug("wait_for_cell"); afs_see_cell(cell, afs_cell_trace_wait); wait_var_event(&cell->state, ({ state = smp_load_acquire(&cell->state); /* vs error */ state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD; })); _debug("waited_for_cell %d %d", cell->state, cell->error); } no_wait: /* Check the state obtained from the wait check. 
*/ state = smp_load_acquire(&cell->state); /* vs error */ if (state == AFS_CELL_DEAD) { ret = cell->error; goto error; } if (state == AFS_CELL_ACTIVE) { switch (cell->dns_status) { case DNS_LOOKUP_NOT_DONE: if (cell->dns_source == DNS_RECORD_FROM_CONFIG) { ret = 0; break; } fallthrough; default: ret = -EIO; goto error; case DNS_LOOKUP_GOOD: case DNS_LOOKUP_GOOD_WITH_BAD: ret = 0; break; case DNS_LOOKUP_GOT_NOT_FOUND: ret = -ENOENT; goto error; case DNS_LOOKUP_BAD: ret = -EREMOTEIO; goto error; case DNS_LOOKUP_GOT_LOCAL_FAILURE: case DNS_LOOKUP_GOT_TEMP_FAILURE: case DNS_LOOKUP_GOT_NS_FAILURE: ret = -EDESTADDRREQ; goto error; } } _leave(" = %p [cell]", cell); return cell; cell_already_exists: _debug("cell exists"); cell = cursor; if (reason == AFS_LOOKUP_CELL_PRELOAD) { ret = -EEXIST; } else { afs_use_cell(cursor, trace); ret = 0; } up_write(&net->cells_lock); if (candidate) afs_put_cell(candidate, afs_cell_trace_put_candidate); if (ret == 0) goto wait_for_cell; goto error_noput; error: afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_error); error_noput: _leave(" = %d [error]", ret); return ERR_PTR(ret); } /* * set the root cell information * - can be called with a module parameter string * - can be called from a write to /proc/fs/afs/rootcell */ int afs_cell_init(struct afs_net *net, const char *rootcell) { struct afs_cell *old_root, *new_root; const char *cp, *vllist; size_t len; _enter(""); if (!rootcell) { /* module is loaded with no parameters, or built statically. * - in the future we might initialize cell DB here. */ _leave(" = 0 [no root]"); return 0; } cp = strchr(rootcell, ':'); if (!cp) { _debug("kAFS: no VL server IP addresses specified"); vllist = NULL; len = strlen(rootcell); } else { vllist = cp + 1; len = cp - rootcell; } if (len == 0 || !rootcell[0] || rootcell[0] == '.' || rootcell[len - 1] == '.') return -EINVAL; if (memchr(rootcell, '/', len)) return -EINVAL; cp = strstr(rootcell, ".."); if (cp && cp < rootcell + len) return -EINVAL; /* allocate a cell record for the root/workstation cell */ new_root = afs_lookup_cell(net, rootcell, len, vllist, AFS_LOOKUP_CELL_ROOTCELL, afs_cell_trace_use_lookup_ws); if (IS_ERR(new_root)) { _leave(" = %ld", PTR_ERR(new_root)); return PTR_ERR(new_root); } if (!test_and_set_bit(AFS_CELL_FL_NO_GC, &new_root->flags)) afs_use_cell(new_root, afs_cell_trace_use_pin); /* install the new cell */ down_write(&net->cells_lock); old_root = rcu_replace_pointer(net->ws_cell, new_root, lockdep_is_held(&net->cells_lock)); up_write(&net->cells_lock); afs_unuse_cell(old_root, afs_cell_trace_unuse_ws); _leave(" = 0"); return 0; } /* * Update a cell's VL server address list from the DNS. 
*/ static int afs_update_cell(struct afs_cell *cell) { struct afs_vlserver_list *vllist, *old = NULL, *p; unsigned int min_ttl = READ_ONCE(afs_cell_min_ttl); unsigned int max_ttl = READ_ONCE(afs_cell_max_ttl); time64_t now, expiry = 0; int ret = 0; _enter("%s", cell->name); vllist = afs_dns_query(cell, &expiry); if (IS_ERR(vllist)) { ret = PTR_ERR(vllist); _debug("%s: fail %d", cell->name, ret); if (ret == -ENOMEM) goto out_wake; vllist = afs_alloc_vlserver_list(0); if (!vllist) { if (ret >= 0) ret = -ENOMEM; goto out_wake; } switch (ret) { case -ENODATA: case -EDESTADDRREQ: vllist->status = DNS_LOOKUP_GOT_NOT_FOUND; break; case -EAGAIN: case -ECONNREFUSED: vllist->status = DNS_LOOKUP_GOT_TEMP_FAILURE; break; default: vllist->status = DNS_LOOKUP_GOT_LOCAL_FAILURE; break; } } _debug("%s: got list %d %d", cell->name, vllist->source, vllist->status); cell->dns_status = vllist->status; now = ktime_get_real_seconds(); if (min_ttl > max_ttl) max_ttl = min_ttl; if (expiry < now + min_ttl) expiry = now + min_ttl; else if (expiry > now + max_ttl) expiry = now + max_ttl; _debug("%s: status %d", cell->name, vllist->status); if (vllist->source == DNS_RECORD_UNAVAILABLE) { switch (vllist->status) { case DNS_LOOKUP_GOT_NOT_FOUND: /* The DNS said that the cell does not exist or there * weren't any addresses to be had. */ cell->dns_expiry = expiry; break; case DNS_LOOKUP_BAD: case DNS_LOOKUP_GOT_LOCAL_FAILURE: case DNS_LOOKUP_GOT_TEMP_FAILURE: case DNS_LOOKUP_GOT_NS_FAILURE: default: cell->dns_expiry = now + 10; break; } } else { cell->dns_expiry = expiry; } /* Replace the VL server list if the new record has servers or the old * record doesn't. */ write_lock(&cell->vl_servers_lock); p = rcu_dereference_protected(cell->vl_servers, true); if (vllist->nr_servers > 0 || p->nr_servers == 0) { rcu_assign_pointer(cell->vl_servers, vllist); cell->dns_source = vllist->source; old = p; } write_unlock(&cell->vl_servers_lock); afs_put_vlserverlist(cell->net, old); out_wake: smp_store_release(&cell->dns_lookup_count, cell->dns_lookup_count + 1); /* vs source/status */ wake_up_var(&cell->dns_lookup_count); _leave(" = %d", ret); return ret; } /* * Destroy a cell record */ static void afs_cell_destroy(struct rcu_head *rcu) { struct afs_cell *cell = container_of(rcu, struct afs_cell, rcu); struct afs_net *net = cell->net; int r; _enter("%p{%s}", cell, cell->name); r = refcount_read(&cell->ref); ASSERTCMP(r, ==, 0); trace_afs_cell(cell->debug_id, r, atomic_read(&cell->active), afs_cell_trace_free); afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers)); afs_unuse_cell(cell->alias_of, afs_cell_trace_unuse_alias); key_put(cell->anonymous_key); idr_remove(&net->cells_dyn_ino, cell->dynroot_ino); kfree(cell->name - 1); kfree(cell); afs_dec_cells_outstanding(net); _leave(" [destroyed]"); } static void afs_destroy_cell_work(struct work_struct *work) { struct afs_cell *cell = container_of(work, struct afs_cell, destroyer); afs_see_cell(cell, afs_cell_trace_destroy); timer_delete_sync(&cell->management_timer); cancel_work_sync(&cell->manager); call_rcu(&cell->rcu, afs_cell_destroy); } /* * Get a reference on a cell record. */ struct afs_cell *afs_get_cell(struct afs_cell *cell, enum afs_cell_trace reason) { int r; __refcount_inc(&cell->ref, &r); trace_afs_cell(cell->debug_id, r + 1, atomic_read(&cell->active), reason); return cell; } /* * Drop a reference on a cell record. 
*/ void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason) { if (cell) { unsigned int debug_id = cell->debug_id; unsigned int a; bool zero; int r; a = atomic_read(&cell->active); zero = __refcount_dec_and_test(&cell->ref, &r); trace_afs_cell(debug_id, r - 1, a, reason); if (zero) { a = atomic_read(&cell->active); WARN(a != 0, "Cell active count %u > 0\n", a); WARN_ON(!queue_work(afs_wq, &cell->destroyer)); } } } /* * Note a cell becoming more active. */ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason) { int r, a; __refcount_inc(&cell->ref, &r); a = atomic_inc_return(&cell->active); trace_afs_cell(cell->debug_id, r + 1, a, reason); return cell; } /* * Record a cell becoming less active. When the active counter reaches 1, it * is scheduled for destruction, but may get reactivated. */ void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason) { unsigned int debug_id; time64_t now, expire_delay; bool zero; int r, a; if (!cell) return; _enter("%s", cell->name); now = ktime_get_real_seconds(); cell->last_inactive = now; expire_delay = 0; if (cell->vl_servers->nr_servers) expire_delay = afs_cell_gc_delay; debug_id = cell->debug_id; a = atomic_dec_return(&cell->active); if (!a) /* 'cell' may now be garbage collected. */ afs_set_cell_timer(cell, expire_delay); zero = __refcount_dec_and_test(&cell->ref, &r); trace_afs_cell(debug_id, r - 1, a, reason); if (zero) WARN_ON(!queue_work(afs_wq, &cell->destroyer)); } /* * Note that a cell has been seen. */ void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason) { int r, a; r = refcount_read(&cell->ref); a = atomic_read(&cell->active); trace_afs_cell(cell->debug_id, r, a, reason); } /* * Queue a cell for management, giving the workqueue a ref to hold. */ void afs_queue_cell(struct afs_cell *cell, enum afs_cell_trace reason) { queue_work(afs_wq, &cell->manager); } /* * Cell-specific management timer. */ static void afs_cell_timer(struct timer_list *timer) { struct afs_cell *cell = container_of(timer, struct afs_cell, management_timer); afs_see_cell(cell, afs_cell_trace_see_mgmt_timer); if (refcount_read(&cell->ref) > 0 && cell->net->live) queue_work(afs_wq, &cell->manager); } /* * Set/reduce the cell timer. */ void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs) { timer_reduce(&cell->management_timer, jiffies + delay_secs * HZ); } /* * Activate a cell. */ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) { struct hlist_node **p; struct afs_cell *pcell; int ret; ret = afs_proc_cell_setup(cell); if (ret < 0) return ret; mutex_lock(&net->proc_cells_lock); for (p = &net->proc_cells.first; *p; p = &(*p)->next) { pcell = hlist_entry(*p, struct afs_cell, proc_link); if (strcmp(cell->name, pcell->name) < 0) break; } cell->proc_link.pprev = p; cell->proc_link.next = *p; rcu_assign_pointer(*p, &cell->proc_link.next); if (cell->proc_link.next) cell->proc_link.next->pprev = &cell->proc_link.next; mutex_unlock(&net->proc_cells_lock); return 0; } /* * Deactivate a cell. 
*/ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) { _enter("%s", cell->name); afs_proc_cell_remove(cell); mutex_lock(&net->proc_cells_lock); if (!hlist_unhashed(&cell->proc_link)) hlist_del_rcu(&cell->proc_link); mutex_unlock(&net->proc_cells_lock); _leave(""); } static bool afs_has_cell_expired(struct afs_cell *cell, time64_t *_next_manage) { const struct afs_vlserver_list *vllist; time64_t expire_at = cell->last_inactive; time64_t now = ktime_get_real_seconds(); if (atomic_read(&cell->active)) return false; if (!cell->net->live) return true; vllist = rcu_dereference_protected(cell->vl_servers, true); if (vllist && vllist->nr_servers > 0) expire_at += afs_cell_gc_delay; if (expire_at <= now) return true; if (expire_at < *_next_manage) *_next_manage = expire_at; return false; } /* * Manage a cell record, initialising and destroying it, maintaining its DNS * records. */ static bool afs_manage_cell(struct afs_cell *cell) { struct afs_net *net = cell->net; time64_t next_manage = TIME64_MAX; int ret; _enter("%s", cell->name); _debug("state %u", cell->state); switch (cell->state) { case AFS_CELL_SETTING_UP: goto set_up_cell; case AFS_CELL_UNLOOKED: case AFS_CELL_ACTIVE: goto cell_is_active; case AFS_CELL_REMOVING: WARN_ON_ONCE(1); return false; case AFS_CELL_DEAD: return false; default: _debug("bad state %u", cell->state); WARN_ON_ONCE(1); /* Unhandled state */ return false; } set_up_cell: ret = afs_activate_cell(net, cell); if (ret < 0) { cell->error = ret; goto remove_cell; } afs_set_cell_state(cell, AFS_CELL_UNLOOKED); cell_is_active: if (afs_has_cell_expired(cell, &next_manage)) goto remove_cell; if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) { ret = afs_update_cell(cell); if (ret < 0) cell->error = ret; if (cell->state == AFS_CELL_UNLOOKED) afs_set_cell_state(cell, AFS_CELL_ACTIVE); } if (next_manage < TIME64_MAX && cell->net->live) { time64_t now = ktime_get_real_seconds(); if (next_manage - now <= 0) afs_queue_cell(cell, afs_cell_trace_queue_again); else afs_set_cell_timer(cell, next_manage - now); } _leave(" [done %u]", cell->state); return false; remove_cell: down_write(&net->cells_lock); if (atomic_read(&cell->active)) { up_write(&net->cells_lock); goto cell_is_active; } /* Make sure that the expiring server records are going to see the fact * that the cell is caput. */ afs_set_cell_state(cell, AFS_CELL_REMOVING); afs_deactivate_cell(net, cell); afs_purge_servers(cell); rb_erase(&cell->net_node, &net->cells); afs_see_cell(cell, afs_cell_trace_unuse_delete); up_write(&net->cells_lock); /* The root volume is pinning the cell */ afs_put_volume(cell->root_volume, afs_volume_trace_put_cell_root); cell->root_volume = NULL; afs_set_cell_state(cell, AFS_CELL_DEAD); return true; } static void afs_manage_cell_work(struct work_struct *work) { struct afs_cell *cell = container_of(work, struct afs_cell, manager); bool final_put; afs_see_cell(cell, afs_cell_trace_manage); final_put = afs_manage_cell(cell); afs_see_cell(cell, afs_cell_trace_managed); if (final_put) afs_put_cell(cell, afs_cell_trace_put_final); } /* * Purge in-memory cell database. 
*/ void afs_cell_purge(struct afs_net *net) { struct afs_cell *ws; struct rb_node *cursor; _enter(""); down_write(&net->cells_lock); ws = rcu_replace_pointer(net->ws_cell, NULL, lockdep_is_held(&net->cells_lock)); up_write(&net->cells_lock); afs_unuse_cell(ws, afs_cell_trace_unuse_ws); _debug("kick cells"); down_read(&net->cells_lock); for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) { struct afs_cell *cell = rb_entry(cursor, struct afs_cell, net_node); afs_see_cell(cell, afs_cell_trace_purge); if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) afs_unuse_cell(cell, afs_cell_trace_unuse_pin); afs_queue_cell(cell, afs_cell_trace_queue_purge); } up_read(&net->cells_lock); _debug("wait"); wait_var_event(&net->cells_outstanding, !atomic_read(&net->cells_outstanding)); _leave(""); } |
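
The cell-name handling above is split between afs_alloc_cell(), which validates and lower-cases the name, and the rbtree walks in afs_find_cell_locked()/afs_lookup_cell(), which compare with strncasecmp(). The stand-alone C sketch below is illustrative only, not kernel code: it applies the same acceptance rules (non-empty, at most AFS_MAXCELLNAME characters, no leading dot, no unprintable characters, '/' or '@') and lower-cases the result the way afs_alloc_cell() does before the cell is inserted; the value 256 for AFS_MAXCELLNAME is an assumption here.

/*
 * User-space illustration (not kernel code): validate and normalise a cell
 * name using the same rules afs_alloc_cell() enforces.
 */
#include <ctype.h>
#include <stdio.h>
#include <string.h>

#define AFS_MAXCELLNAME 256	/* assumed value */

static int afs_validate_cell_name(const char *name, char *out, size_t outsz)
{
	size_t len = strlen(name);

	if (len == 0 || len > AFS_MAXCELLNAME || len + 1 > outsz)
		return -1;
	if (name[0] == '.')			/* no leading dot */
		return -1;
	for (size_t i = 0; i < len; i++) {
		unsigned char ch = name[i];

		/* No unprintable chars, '/' or '@'; this also rejects "@cell". */
		if (!isprint(ch) || ch == '/' || ch == '@')
			return -1;
		out[i] = tolower(ch);		/* cell names are stored lower-cased */
	}
	out[len] = '\0';
	return 0;
}

int main(void)
{
	const char *tests[] = { "Example.ORG", ".hidden", "bad/cell", "grand.central.org" };
	char buf[AFS_MAXCELLNAME + 1];

	for (size_t i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
		printf("%-20s -> %s\n", tests[i],
		       afs_validate_cell_name(tests[i], buf, sizeof(buf)) ? "rejected" : buf);
	return 0;
}

With these rules, "Example.ORG" normalises to "example.org", while ".hidden" and "bad/cell" are rejected, matching the -EINVAL paths in afs_alloc_cell().
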
3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 
/* BlueZ - Bluetooth protocol stack for Linux

   Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License version 2 as
   published by the Free Software Foundation;

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE
   IS DISCLAIMED.
*/

#include <linux/debugfs.h>
#include <linux/scatterlist.h>
#include <crypto/aes.h>
#include <crypto/hash.h>
#include <crypto/kpp.h>
#include <crypto/utils.h>

#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/l2cap.h>
#include <net/bluetooth/mgmt.h>

#include "ecdh_helper.h"
#include "smp.h"

#define SMP_DEV(hdev) \
	((struct smp_dev *)((struct l2cap_chan *)((hdev)->smp_data))->data)

/* Low-level debug macros to be used for stuff that we don't want
 * accidentally in dmesg, i.e. the values of the various crypto keys
 * and the inputs & outputs of crypto functions.
 */
#ifdef DEBUG
#define SMP_DBG(fmt, ...) printk(KERN_DEBUG "%s: " fmt, __func__, \
				 ##__VA_ARGS__)
#else
#define SMP_DBG(fmt, ...) no_printk(KERN_DEBUG "%s: " fmt, __func__, \
				    ##__VA_ARGS__)
#endif

#define SMP_ALLOW_CMD(smp, code)	set_bit(code, &smp->allow_cmd)

/* Keys which are not distributed with Secure Connections */
#define SMP_SC_NO_DIST (SMP_DIST_ENC_KEY | SMP_DIST_LINK_KEY)

#define SMP_TIMEOUT	secs_to_jiffies(30)

#define ID_ADDR_TIMEOUT	msecs_to_jiffies(200)

#define AUTH_REQ_MASK(dev)	(hci_dev_test_flag(dev, HCI_SC_ENABLED) ?
\ 0x3f : 0x07) #define KEY_DIST_MASK 0x07 /* Maximum message length that can be passed to aes_cmac */ #define CMAC_MSG_MAX 80 enum { SMP_FLAG_TK_VALID, SMP_FLAG_CFM_PENDING, SMP_FLAG_MITM_AUTH, SMP_FLAG_COMPLETE, SMP_FLAG_INITIATOR, SMP_FLAG_SC, SMP_FLAG_REMOTE_PK, SMP_FLAG_DEBUG_KEY, SMP_FLAG_WAIT_USER, SMP_FLAG_DHKEY_PENDING, SMP_FLAG_REMOTE_OOB, SMP_FLAG_LOCAL_OOB, SMP_FLAG_CT2, }; struct smp_dev { /* Secure Connections OOB data */ bool local_oob; u8 local_pk[64]; u8 local_rand[16]; bool debug_key; struct crypto_shash *tfm_cmac; struct crypto_kpp *tfm_ecdh; }; struct smp_chan { struct l2cap_conn *conn; struct delayed_work security_timer; unsigned long allow_cmd; /* Bitmask of allowed commands */ u8 preq[7]; /* SMP Pairing Request */ u8 prsp[7]; /* SMP Pairing Response */ u8 prnd[16]; /* SMP Pairing Random (local) */ u8 rrnd[16]; /* SMP Pairing Random (remote) */ u8 pcnf[16]; /* SMP Pairing Confirm */ u8 tk[16]; /* SMP Temporary Key */ u8 rr[16]; /* Remote OOB ra/rb value */ u8 lr[16]; /* Local OOB ra/rb value */ u8 enc_key_size; u8 remote_key_dist; bdaddr_t id_addr; u8 id_addr_type; u8 irk[16]; struct smp_csrk *csrk; struct smp_csrk *responder_csrk; struct smp_ltk *ltk; struct smp_ltk *responder_ltk; struct smp_irk *remote_irk; u8 *link_key; unsigned long flags; u8 method; u8 passkey_round; /* Secure Connections variables */ u8 local_pk[64]; u8 remote_pk[64]; u8 dhkey[32]; u8 mackey[16]; struct crypto_shash *tfm_cmac; struct crypto_kpp *tfm_ecdh; }; /* These debug key values are defined in the SMP section of the core * specification. debug_pk is the public debug key and debug_sk the * private debug key. */ static const u8 debug_pk[64] = { 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc, 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef, 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e, 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20, 0x8b, 0xd2, 0x89, 0x15, 0xd0, 0x8e, 0x1c, 0x74, 0x24, 0x30, 0xed, 0x8f, 0xc2, 0x45, 0x63, 0x76, 0x5c, 0x15, 0x52, 0x5a, 0xbf, 0x9a, 0x32, 0x63, 0x6d, 0xeb, 0x2a, 0x65, 0x49, 0x9c, 0x80, 0xdc, }; static const u8 debug_sk[32] = { 0xbd, 0x1a, 0x3c, 0xcd, 0xa6, 0xb8, 0x99, 0x58, 0x99, 0xb7, 0x40, 0xeb, 0x7b, 0x60, 0xff, 0x4a, 0x50, 0x3f, 0x10, 0xd2, 0xe3, 0xb3, 0xc9, 0x74, 0x38, 0x5f, 0xc5, 0xa3, 0xd4, 0xf6, 0x49, 0x3f, }; static inline void swap_buf(const u8 *src, u8 *dst, size_t len) { size_t i; for (i = 0; i < len; i++) dst[len - 1 - i] = src[i]; } /* The following functions map to the LE SC SMP crypto functions * AES-CMAC, f4, f5, f6, g2 and h6. 
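 *
 * (Added note, not part of the upstream comment: roughly, f4 builds the
 * pairing confirm values, f5 derives MacKey and LTK from the ECDH shared
 * secret, f6 builds the DHKey Check values, g2 produces the 6-digit
 * numeric-comparison value, and h6/h7 convert keys across transports.
 * The g2 truncation implemented in smp_g2() below amounts to
 *
 *   passkey = get_unaligned_le32(cmac_output) % 1000000;
 *
 * which is why the displayed value always has at most six digits.)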
*/ static int aes_cmac(struct crypto_shash *tfm, const u8 k[16], const u8 *m, size_t len, u8 mac[16]) { uint8_t tmp[16], mac_msb[16], msg_msb[CMAC_MSG_MAX]; int err; if (len > CMAC_MSG_MAX) return -EFBIG; if (!tfm) { BT_ERR("tfm %p", tfm); return -EINVAL; } /* Swap key and message from LSB to MSB */ swap_buf(k, tmp, 16); swap_buf(m, msg_msb, len); SMP_DBG("msg (len %zu) %*phN", len, (int) len, m); SMP_DBG("key %16phN", k); err = crypto_shash_setkey(tfm, tmp, 16); if (err) { BT_ERR("cipher setkey failed: %d", err); return err; } err = crypto_shash_tfm_digest(tfm, msg_msb, len, mac_msb); if (err) { BT_ERR("Hash computation error %d", err); return err; } swap_buf(mac_msb, mac, 16); SMP_DBG("mac %16phN", mac); return 0; } static int smp_f4(struct crypto_shash *tfm_cmac, const u8 u[32], const u8 v[32], const u8 x[16], u8 z, u8 res[16]) { u8 m[65]; int err; SMP_DBG("u %32phN", u); SMP_DBG("v %32phN", v); SMP_DBG("x %16phN z %02x", x, z); m[0] = z; memcpy(m + 1, v, 32); memcpy(m + 33, u, 32); err = aes_cmac(tfm_cmac, x, m, sizeof(m), res); if (err) return err; SMP_DBG("res %16phN", res); return err; } static int smp_f5(struct crypto_shash *tfm_cmac, const u8 w[32], const u8 n1[16], const u8 n2[16], const u8 a1[7], const u8 a2[7], u8 mackey[16], u8 ltk[16]) { /* The btle, salt and length "magic" values are as defined in * the SMP section of the Bluetooth core specification. In ASCII * the btle value ends up being 'btle'. The salt is just a * random number whereas length is the value 256 in little * endian format. */ const u8 btle[4] = { 0x65, 0x6c, 0x74, 0x62 }; const u8 salt[16] = { 0xbe, 0x83, 0x60, 0x5a, 0xdb, 0x0b, 0x37, 0x60, 0x38, 0xa5, 0xf5, 0xaa, 0x91, 0x83, 0x88, 0x6c }; const u8 length[2] = { 0x00, 0x01 }; u8 m[53], t[16]; int err; SMP_DBG("w %32phN", w); SMP_DBG("n1 %16phN n2 %16phN", n1, n2); SMP_DBG("a1 %7phN a2 %7phN", a1, a2); err = aes_cmac(tfm_cmac, salt, w, 32, t); if (err) return err; SMP_DBG("t %16phN", t); memcpy(m, length, 2); memcpy(m + 2, a2, 7); memcpy(m + 9, a1, 7); memcpy(m + 16, n2, 16); memcpy(m + 32, n1, 16); memcpy(m + 48, btle, 4); m[52] = 0; /* Counter */ err = aes_cmac(tfm_cmac, t, m, sizeof(m), mackey); if (err) return err; SMP_DBG("mackey %16phN", mackey); m[52] = 1; /* Counter */ err = aes_cmac(tfm_cmac, t, m, sizeof(m), ltk); if (err) return err; SMP_DBG("ltk %16phN", ltk); return 0; } static int smp_f6(struct crypto_shash *tfm_cmac, const u8 w[16], const u8 n1[16], const u8 n2[16], const u8 r[16], const u8 io_cap[3], const u8 a1[7], const u8 a2[7], u8 res[16]) { u8 m[65]; int err; SMP_DBG("w %16phN", w); SMP_DBG("n1 %16phN n2 %16phN", n1, n2); SMP_DBG("r %16phN io_cap %3phN a1 %7phN a2 %7phN", r, io_cap, a1, a2); memcpy(m, a2, 7); memcpy(m + 7, a1, 7); memcpy(m + 14, io_cap, 3); memcpy(m + 17, r, 16); memcpy(m + 33, n2, 16); memcpy(m + 49, n1, 16); err = aes_cmac(tfm_cmac, w, m, sizeof(m), res); if (err) return err; SMP_DBG("res %16phN", res); return err; } static int smp_g2(struct crypto_shash *tfm_cmac, const u8 u[32], const u8 v[32], const u8 x[16], const u8 y[16], u32 *val) { u8 m[80], tmp[16]; int err; SMP_DBG("u %32phN", u); SMP_DBG("v %32phN", v); SMP_DBG("x %16phN y %16phN", x, y); memcpy(m, y, 16); memcpy(m + 16, v, 32); memcpy(m + 48, u, 32); err = aes_cmac(tfm_cmac, x, m, sizeof(m), tmp); if (err) return err; *val = get_unaligned_le32(tmp); *val %= 1000000; SMP_DBG("val %06u", *val); return 0; } static int smp_h6(struct crypto_shash *tfm_cmac, const u8 w[16], const u8 key_id[4], u8 res[16]) { int err; SMP_DBG("w %16phN key_id %4phN", w, key_id); 
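	/* Added illustration, not in the original source: h6 is a single
	 * AES-CMAC keyed with w over the 4-byte key_id. For example, the
	 * link-key derivation further below passes key_id "lebr" as
	 * { 0x72, 0x62, 0x65, 0x6c }, i.e. the ASCII string in
	 * little-endian byte order.
	 */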
err = aes_cmac(tfm_cmac, w, key_id, 4, res); if (err) return err; SMP_DBG("res %16phN", res); return err; } static int smp_h7(struct crypto_shash *tfm_cmac, const u8 w[16], const u8 salt[16], u8 res[16]) { int err; SMP_DBG("w %16phN salt %16phN", w, salt); err = aes_cmac(tfm_cmac, salt, w, 16, res); if (err) return err; SMP_DBG("res %16phN", res); return err; } /* The following functions map to the legacy SMP crypto functions e, c1, * s1 and ah. */ static int smp_e(const u8 *k, u8 *r) { struct crypto_aes_ctx ctx; uint8_t tmp[16], data[16]; int err; SMP_DBG("k %16phN r %16phN", k, r); /* The most significant octet of key corresponds to k[0] */ swap_buf(k, tmp, 16); err = aes_expandkey(&ctx, tmp, 16); if (err) { BT_ERR("cipher setkey failed: %d", err); return err; } /* Most significant octet of plaintextData corresponds to data[0] */ swap_buf(r, data, 16); aes_encrypt(&ctx, data, data); /* Most significant octet of encryptedData corresponds to data[0] */ swap_buf(data, r, 16); SMP_DBG("r %16phN", r); memzero_explicit(&ctx, sizeof(ctx)); return err; } static int smp_c1(const u8 k[16], const u8 r[16], const u8 preq[7], const u8 pres[7], u8 _iat, const bdaddr_t *ia, u8 _rat, const bdaddr_t *ra, u8 res[16]) { u8 p1[16], p2[16]; int err; SMP_DBG("k %16phN r %16phN", k, r); SMP_DBG("iat %u ia %6phN rat %u ra %6phN", _iat, ia, _rat, ra); SMP_DBG("preq %7phN pres %7phN", preq, pres); memset(p1, 0, 16); /* p1 = pres || preq || _rat || _iat */ p1[0] = _iat; p1[1] = _rat; memcpy(p1 + 2, preq, 7); memcpy(p1 + 9, pres, 7); SMP_DBG("p1 %16phN", p1); /* res = r XOR p1 */ crypto_xor_cpy(res, r, p1, sizeof(p1)); /* res = e(k, res) */ err = smp_e(k, res); if (err) { BT_ERR("Encrypt data error"); return err; } /* p2 = padding || ia || ra */ memcpy(p2, ra, 6); memcpy(p2 + 6, ia, 6); memset(p2 + 12, 0, 4); SMP_DBG("p2 %16phN", p2); /* res = res XOR p2 */ crypto_xor(res, p2, sizeof(p2)); /* res = e(k, res) */ err = smp_e(k, res); if (err) BT_ERR("Encrypt data error"); return err; } static int smp_s1(const u8 k[16], const u8 r1[16], const u8 r2[16], u8 _r[16]) { int err; /* Just least significant octets from r1 and r2 are considered */ memcpy(_r, r2, 8); memcpy(_r + 8, r1, 8); err = smp_e(k, _r); if (err) BT_ERR("Encrypt data error"); return err; } static int smp_ah(const u8 irk[16], const u8 r[3], u8 res[3]) { u8 _res[16]; int err; /* r' = padding || r */ memcpy(_res, r, 3); memset(_res + 3, 0, 13); err = smp_e(irk, _res); if (err) { BT_ERR("Encrypt error"); return err; } /* The output of the random address function ah is: * ah(k, r) = e(k, r') mod 2^24 * The output of the security function e is then truncated to 24 bits * by taking the least significant 24 bits of the output of e as the * result of ah. 
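 *
 * (Added illustration, not part of the upstream comment: a resolvable
 * private address stores prand in bdaddr->b[3..5] (with the two most
 * significant bits of b[5] set to 0b01) and hash = ah(IRK, prand) in
 * bdaddr->b[0..2]. Resolution, as done in smp_irk_matches() below, is
 * simply:
 *
 *   smp_ah(irk, &bdaddr->b[3], hash);
 *   matches = !crypto_memneq(bdaddr->b, hash, 3);
 * )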
*/ memcpy(res, _res, 3); return 0; } bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], const bdaddr_t *bdaddr) { struct l2cap_chan *chan = hdev->smp_data; u8 hash[3]; int err; if (!chan || !chan->data) return false; bt_dev_dbg(hdev, "RPA %pMR IRK %*phN", bdaddr, 16, irk); err = smp_ah(irk, &bdaddr->b[3], hash); if (err) return false; return !crypto_memneq(bdaddr->b, hash, 3); } int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa) { struct l2cap_chan *chan = hdev->smp_data; int err; if (!chan || !chan->data) return -EOPNOTSUPP; get_random_bytes(&rpa->b[3], 3); rpa->b[5] &= 0x3f; /* Clear two most significant bits */ rpa->b[5] |= 0x40; /* Set second most significant bit */ err = smp_ah(irk, &rpa->b[3], rpa->b); if (err < 0) return err; bt_dev_dbg(hdev, "RPA %pMR", rpa); return 0; } int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]) { struct l2cap_chan *chan = hdev->smp_data; struct smp_dev *smp; int err; if (!chan || !chan->data) return -EOPNOTSUPP; smp = chan->data; if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) { bt_dev_dbg(hdev, "Using debug keys"); err = set_ecdh_privkey(smp->tfm_ecdh, debug_sk); if (err) return err; memcpy(smp->local_pk, debug_pk, 64); smp->debug_key = true; } else { while (true) { /* Generate key pair for Secure Connections */ err = generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk); if (err) return err; /* This is unlikely, but we need to check that * we didn't accidentally generate a debug key. */ if (crypto_memneq(smp->local_pk, debug_pk, 64)) break; } smp->debug_key = false; } SMP_DBG("OOB Public Key X: %32phN", smp->local_pk); SMP_DBG("OOB Public Key Y: %32phN", smp->local_pk + 32); get_random_bytes(smp->local_rand, 16); err = smp_f4(smp->tfm_cmac, smp->local_pk, smp->local_pk, smp->local_rand, 0, hash); if (err < 0) return err; memcpy(rand, smp->local_rand, 16); smp->local_oob = true; return 0; } static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp; struct kvec iv[2]; struct msghdr msg; if (!chan) return; bt_dev_dbg(conn->hcon->hdev, "code 0x%2.2x", code); iv[0].iov_base = &code; iv[0].iov_len = 1; iv[1].iov_base = data; iv[1].iov_len = len; memset(&msg, 0, sizeof(msg)); iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iv, 2, 1 + len); l2cap_chan_send(chan, &msg, 1 + len, NULL); if (!chan->data) return; smp = chan->data; cancel_delayed_work_sync(&smp->security_timer); schedule_delayed_work(&smp->security_timer, SMP_TIMEOUT); } static u8 authreq_to_seclevel(u8 authreq) { if (authreq & SMP_AUTH_MITM) { if (authreq & SMP_AUTH_SC) return BT_SECURITY_FIPS; else return BT_SECURITY_HIGH; } else { return BT_SECURITY_MEDIUM; } } static __u8 seclevel_to_authreq(__u8 sec_level) { switch (sec_level) { case BT_SECURITY_FIPS: case BT_SECURITY_HIGH: return SMP_AUTH_MITM | SMP_AUTH_BONDING; case BT_SECURITY_MEDIUM: return SMP_AUTH_BONDING; default: return SMP_AUTH_NONE; } } static void build_pairing_cmd(struct l2cap_conn *conn, struct smp_cmd_pairing *req, struct smp_cmd_pairing *rsp, __u8 authreq) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; u8 local_dist = 0, remote_dist = 0, oob_flag = SMP_OOB_NOT_PRESENT; if (hci_dev_test_flag(hdev, HCI_BONDABLE)) { local_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; remote_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; authreq |= SMP_AUTH_BONDING; } else { authreq &= ~SMP_AUTH_BONDING; } if (hci_dev_test_flag(hdev, 
HCI_RPA_RESOLVING)) remote_dist |= SMP_DIST_ID_KEY; if (hci_dev_test_flag(hdev, HCI_PRIVACY)) local_dist |= SMP_DIST_ID_KEY; if (hci_dev_test_flag(hdev, HCI_SC_ENABLED) && (authreq & SMP_AUTH_SC)) { struct oob_data *oob_data; u8 bdaddr_type; if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) { local_dist |= SMP_DIST_LINK_KEY; remote_dist |= SMP_DIST_LINK_KEY; } if (hcon->dst_type == ADDR_LE_DEV_PUBLIC) bdaddr_type = BDADDR_LE_PUBLIC; else bdaddr_type = BDADDR_LE_RANDOM; oob_data = hci_find_remote_oob_data(hdev, &hcon->dst, bdaddr_type); if (oob_data && oob_data->present) { set_bit(SMP_FLAG_REMOTE_OOB, &smp->flags); oob_flag = SMP_OOB_PRESENT; memcpy(smp->rr, oob_data->rand256, 16); memcpy(smp->pcnf, oob_data->hash256, 16); SMP_DBG("OOB Remote Confirmation: %16phN", smp->pcnf); SMP_DBG("OOB Remote Random: %16phN", smp->rr); } } else { authreq &= ~SMP_AUTH_SC; } if (rsp == NULL) { req->io_capability = conn->hcon->io_capability; req->oob_flag = oob_flag; req->max_key_size = hdev->le_max_key_size; req->init_key_dist = local_dist; req->resp_key_dist = remote_dist; req->auth_req = (authreq & AUTH_REQ_MASK(hdev)); smp->remote_key_dist = remote_dist; return; } rsp->io_capability = conn->hcon->io_capability; rsp->oob_flag = oob_flag; rsp->max_key_size = hdev->le_max_key_size; rsp->init_key_dist = req->init_key_dist & remote_dist; rsp->resp_key_dist = req->resp_key_dist & local_dist; rsp->auth_req = (authreq & AUTH_REQ_MASK(hdev)); smp->remote_key_dist = rsp->init_key_dist; } static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) { struct l2cap_chan *chan = conn->smp; struct hci_dev *hdev = conn->hcon->hdev; struct smp_chan *smp = chan->data; if (conn->hcon->pending_sec_level == BT_SECURITY_FIPS && max_key_size != SMP_MAX_ENC_KEY_SIZE) return SMP_ENC_KEY_SIZE; if (max_key_size > hdev->le_max_key_size || max_key_size < SMP_MIN_ENC_KEY_SIZE) return SMP_ENC_KEY_SIZE; smp->enc_key_size = max_key_size; return 0; } static void smp_chan_destroy(struct l2cap_conn *conn) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; bool complete; BUG_ON(!smp); cancel_delayed_work_sync(&smp->security_timer); complete = test_bit(SMP_FLAG_COMPLETE, &smp->flags); mgmt_smp_complete(hcon, complete); kfree_sensitive(smp->csrk); kfree_sensitive(smp->responder_csrk); kfree_sensitive(smp->link_key); crypto_free_shash(smp->tfm_cmac); crypto_free_kpp(smp->tfm_ecdh); /* Ensure that we don't leave any debug key around if debug key * support hasn't been explicitly enabled. 
*/ if (smp->ltk && smp->ltk->type == SMP_LTK_P256_DEBUG && !hci_dev_test_flag(hcon->hdev, HCI_KEEP_DEBUG_KEYS)) { list_del_rcu(&smp->ltk->list); kfree_rcu(smp->ltk, rcu); smp->ltk = NULL; } /* If pairing failed clean up any keys we might have */ if (!complete) { if (smp->ltk) { list_del_rcu(&smp->ltk->list); kfree_rcu(smp->ltk, rcu); } if (smp->responder_ltk) { list_del_rcu(&smp->responder_ltk->list); kfree_rcu(smp->responder_ltk, rcu); } if (smp->remote_irk) { list_del_rcu(&smp->remote_irk->list); kfree_rcu(smp->remote_irk, rcu); } } chan->data = NULL; kfree_sensitive(smp); hci_conn_drop(hcon); } static void smp_failure(struct l2cap_conn *conn, u8 reason) { struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan = conn->smp; if (reason) smp_send_cmd(conn, SMP_CMD_PAIRING_FAIL, sizeof(reason), &reason); mgmt_auth_failed(hcon, HCI_ERROR_AUTH_FAILURE); if (chan->data) smp_chan_destroy(conn); } #define JUST_WORKS 0x00 #define JUST_CFM 0x01 #define REQ_PASSKEY 0x02 #define CFM_PASSKEY 0x03 #define REQ_OOB 0x04 #define DSP_PASSKEY 0x05 #define OVERLAP 0xFF static const u8 gen_method[5][5] = { { JUST_WORKS, JUST_CFM, REQ_PASSKEY, JUST_WORKS, REQ_PASSKEY }, { JUST_WORKS, JUST_CFM, REQ_PASSKEY, JUST_WORKS, REQ_PASSKEY }, { CFM_PASSKEY, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, CFM_PASSKEY }, { JUST_WORKS, JUST_CFM, JUST_WORKS, JUST_WORKS, JUST_CFM }, { CFM_PASSKEY, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, OVERLAP }, }; static const u8 sc_method[5][5] = { { JUST_WORKS, JUST_CFM, REQ_PASSKEY, JUST_WORKS, REQ_PASSKEY }, { JUST_WORKS, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, CFM_PASSKEY }, { DSP_PASSKEY, DSP_PASSKEY, REQ_PASSKEY, JUST_WORKS, DSP_PASSKEY }, { JUST_WORKS, JUST_CFM, JUST_WORKS, JUST_WORKS, JUST_CFM }, { DSP_PASSKEY, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, CFM_PASSKEY }, }; static u8 get_auth_method(struct smp_chan *smp, u8 local_io, u8 remote_io) { /* If either side has unknown io_caps, use JUST_CFM (which gets * converted later to JUST_WORKS if we're initiators. */ if (local_io > SMP_IO_KEYBOARD_DISPLAY || remote_io > SMP_IO_KEYBOARD_DISPLAY) return JUST_CFM; if (test_bit(SMP_FLAG_SC, &smp->flags)) return sc_method[remote_io][local_io]; return gen_method[remote_io][local_io]; } static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, u8 local_io, u8 remote_io) { struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; u32 passkey = 0; int ret; /* Initialize key for JUST WORKS */ memset(smp->tk, 0, sizeof(smp->tk)); clear_bit(SMP_FLAG_TK_VALID, &smp->flags); bt_dev_dbg(hcon->hdev, "auth:%u lcl:%u rem:%u", auth, local_io, remote_io); /* If neither side wants MITM, either "just" confirm an incoming * request or use just-works for outgoing ones. The JUST_CFM * will be converted to JUST_WORKS if necessary later in this * function. If either side has MITM look up the method from the * table. 
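 *
 * (Added example, not part of the upstream comment: with MITM set, a
 * local DisplayOnly (0x00) device pairing with a remote KeyboardOnly
 * (0x02) device resolves via gen_method[remote_io][local_io], i.e.
 * gen_method[0x02][0x00] == CFM_PASSKEY: we generate and display a
 * random passkey and the remote side types it in.)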
*/ if (!(auth & SMP_AUTH_MITM)) smp->method = JUST_CFM; else smp->method = get_auth_method(smp, local_io, remote_io); /* Don't confirm locally initiated pairing attempts */ if (smp->method == JUST_CFM && test_bit(SMP_FLAG_INITIATOR, &smp->flags)) smp->method = JUST_WORKS; /* Don't bother user space with no IO capabilities */ if (smp->method == JUST_CFM && hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) smp->method = JUST_WORKS; /* If Just Works, Continue with Zero TK and ask user-space for * confirmation */ if (smp->method == JUST_WORKS) { ret = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, 1); if (ret) return ret; set_bit(SMP_FLAG_WAIT_USER, &smp->flags); return 0; } /* If this function is used for SC -> legacy fallback we * can only recover the just-works case. */ if (test_bit(SMP_FLAG_SC, &smp->flags)) return -EINVAL; /* Not Just Works/Confirm results in MITM Authentication */ if (smp->method != JUST_CFM) { set_bit(SMP_FLAG_MITM_AUTH, &smp->flags); if (hcon->pending_sec_level < BT_SECURITY_HIGH) hcon->pending_sec_level = BT_SECURITY_HIGH; } /* If both devices have Keyboard-Display I/O, the initiator * Confirms and the responder Enters the passkey. */ if (smp->method == OVERLAP) { if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) smp->method = CFM_PASSKEY; else smp->method = REQ_PASSKEY; } /* Generate random passkey. */ if (smp->method == CFM_PASSKEY) { memset(smp->tk, 0, sizeof(smp->tk)); get_random_bytes(&passkey, sizeof(passkey)); passkey %= 1000000; put_unaligned_le32(passkey, smp->tk); bt_dev_dbg(hcon->hdev, "PassKey: %u", passkey); set_bit(SMP_FLAG_TK_VALID, &smp->flags); } if (smp->method == REQ_PASSKEY) ret = mgmt_user_passkey_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type); else if (smp->method == JUST_CFM) ret = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, 1); else ret = mgmt_user_passkey_notify(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, 0); return ret; } static u8 smp_confirm(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; struct smp_cmd_pairing_confirm cp; int ret; bt_dev_dbg(conn->hcon->hdev, "conn %p", conn); ret = smp_c1(smp->tk, smp->prnd, smp->preq, smp->prsp, conn->hcon->init_addr_type, &conn->hcon->init_addr, conn->hcon->resp_addr_type, &conn->hcon->resp_addr, cp.confirm_val); if (ret) return SMP_UNSPECIFIED; clear_bit(SMP_FLAG_CFM_PENDING, &smp->flags); smp_send_cmd(smp->conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cp), &cp); if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); else SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); return 0; } static u8 smp_random(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; u8 confirm[16]; int ret; bt_dev_dbg(conn->hcon->hdev, "conn %p %s", conn, test_bit(SMP_FLAG_INITIATOR, &smp->flags) ? 
"initiator" : "responder"); ret = smp_c1(smp->tk, smp->rrnd, smp->preq, smp->prsp, hcon->init_addr_type, &hcon->init_addr, hcon->resp_addr_type, &hcon->resp_addr, confirm); if (ret) return SMP_UNSPECIFIED; if (crypto_memneq(smp->pcnf, confirm, sizeof(smp->pcnf))) { bt_dev_err(hcon->hdev, "pairing failed " "(confirmation values mismatch)"); return SMP_CONFIRM_FAILED; } if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { u8 stk[16]; __le64 rand = 0; __le16 ediv = 0; smp_s1(smp->tk, smp->rrnd, smp->prnd, stk); if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags)) return SMP_UNSPECIFIED; hci_le_start_enc(hcon, ediv, rand, stk, smp->enc_key_size); hcon->enc_key_size = smp->enc_key_size; set_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags); } else { u8 stk[16], auth; __le64 rand = 0; __le16 ediv = 0; smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); smp_s1(smp->tk, smp->prnd, smp->rrnd, stk); if (hcon->pending_sec_level == BT_SECURITY_HIGH) auth = 1; else auth = 0; /* Even though there's no _RESPONDER suffix this is the * responder STK we're adding for later lookup (the initiator * STK never needs to be stored). */ hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, SMP_STK, auth, stk, smp->enc_key_size, ediv, rand); } return 0; } static void smp_notify_keys(struct l2cap_conn *conn) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; struct smp_cmd_pairing *req = (void *) &smp->preq[1]; struct smp_cmd_pairing *rsp = (void *) &smp->prsp[1]; bool persistent; if (hcon->type == ACL_LINK) { if (hcon->key_type == HCI_LK_DEBUG_COMBINATION) persistent = false; else persistent = !test_bit(HCI_CONN_FLUSH_KEY, &hcon->flags); } else { /* The LTKs, IRKs and CSRKs should be persistent only if * both sides had the bonding bit set in their * authentication requests. */ persistent = !!((req->auth_req & rsp->auth_req) & SMP_AUTH_BONDING); } if (smp->remote_irk) { mgmt_new_irk(hdev, smp->remote_irk, persistent); /* Now that user space can be considered to know the * identity address track the connection based on it * from now on (assuming this is an LE link). */ if (hcon->type == LE_LINK) { bacpy(&hcon->dst, &smp->remote_irk->bdaddr); hcon->dst_type = smp->remote_irk->addr_type; /* Use a short delay to make sure the new address is * propagated _before_ the channels. 
*/ queue_delayed_work(hdev->workqueue, &conn->id_addr_timer, ID_ADDR_TIMEOUT); } } if (smp->csrk) { smp->csrk->bdaddr_type = hcon->dst_type; bacpy(&smp->csrk->bdaddr, &hcon->dst); mgmt_new_csrk(hdev, smp->csrk, persistent); } if (smp->responder_csrk) { smp->responder_csrk->bdaddr_type = hcon->dst_type; bacpy(&smp->responder_csrk->bdaddr, &hcon->dst); mgmt_new_csrk(hdev, smp->responder_csrk, persistent); } if (smp->ltk) { smp->ltk->bdaddr_type = hcon->dst_type; bacpy(&smp->ltk->bdaddr, &hcon->dst); mgmt_new_ltk(hdev, smp->ltk, persistent); } if (smp->responder_ltk) { smp->responder_ltk->bdaddr_type = hcon->dst_type; bacpy(&smp->responder_ltk->bdaddr, &hcon->dst); mgmt_new_ltk(hdev, smp->responder_ltk, persistent); } if (smp->link_key) { struct link_key *key; u8 type; if (test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags)) type = HCI_LK_DEBUG_COMBINATION; else if (hcon->sec_level == BT_SECURITY_FIPS) type = HCI_LK_AUTH_COMBINATION_P256; else type = HCI_LK_UNAUTH_COMBINATION_P256; key = hci_add_link_key(hdev, smp->conn->hcon, &hcon->dst, smp->link_key, type, 0, &persistent); if (key) { mgmt_new_link_key(hdev, key, persistent); /* Don't keep debug keys around if the relevant * flag is not set. */ if (!hci_dev_test_flag(hdev, HCI_KEEP_DEBUG_KEYS) && key->type == HCI_LK_DEBUG_COMBINATION) { list_del_rcu(&key->list); kfree_rcu(key, rcu); } } } } static void sc_add_ltk(struct smp_chan *smp) { struct hci_conn *hcon = smp->conn->hcon; u8 key_type, auth; if (test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags)) key_type = SMP_LTK_P256_DEBUG; else key_type = SMP_LTK_P256; if (hcon->pending_sec_level == BT_SECURITY_FIPS) auth = 1; else auth = 0; smp->ltk = hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, key_type, auth, smp->tk, smp->enc_key_size, 0, 0); } static void sc_generate_link_key(struct smp_chan *smp) { /* From core spec. Spells out in ASCII as 'lebr'. */ const u8 lebr[4] = { 0x72, 0x62, 0x65, 0x6c }; smp->link_key = kzalloc(16, GFP_KERNEL); if (!smp->link_key) return; if (test_bit(SMP_FLAG_CT2, &smp->flags)) { /* SALT = 0x000000000000000000000000746D7031 */ const u8 salt[16] = { 0x31, 0x70, 0x6d, 0x74 }; if (smp_h7(smp->tfm_cmac, smp->tk, salt, smp->link_key)) { kfree_sensitive(smp->link_key); smp->link_key = NULL; return; } } else { /* From core spec. Spells out in ASCII as 'tmp1'. */ const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 }; if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) { kfree_sensitive(smp->link_key); smp->link_key = NULL; return; } } if (smp_h6(smp->tfm_cmac, smp->link_key, lebr, smp->link_key)) { kfree_sensitive(smp->link_key); smp->link_key = NULL; return; } } static void smp_allow_key_dist(struct smp_chan *smp) { /* Allow the first expected phase 3 PDU. The rest of the PDUs * will be allowed in each PDU handler to ensure we receive * them in the correct order. */ if (smp->remote_key_dist & SMP_DIST_ENC_KEY) SMP_ALLOW_CMD(smp, SMP_CMD_ENCRYPT_INFO); else if (smp->remote_key_dist & SMP_DIST_ID_KEY) SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_INFO); else if (smp->remote_key_dist & SMP_DIST_SIGN) SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO); } static void sc_generate_ltk(struct smp_chan *smp) { /* From core spec. Spells out in ASCII as 'brle'. 
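 *
 * (Added note, not part of the upstream comment: this is the mirror of
 * sc_generate_link_key() above. A BR/EDR link key is turned into an LE
 * LTK in two steps, roughly:
 *
 *   ILK = CT2 ? h7(SALT, LinkKey) : h6(LinkKey, "tmp2");
 *   LTK = h6(ILK, "brle");
 * )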
*/ const u8 brle[4] = { 0x65, 0x6c, 0x72, 0x62 }; struct hci_conn *hcon = smp->conn->hcon; struct hci_dev *hdev = hcon->hdev; struct link_key *key; key = hci_find_link_key(hdev, &hcon->dst); if (!key) { bt_dev_err(hdev, "no Link Key found to generate LTK"); return; } if (key->type == HCI_LK_DEBUG_COMBINATION) set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags); if (test_bit(SMP_FLAG_CT2, &smp->flags)) { /* SALT = 0x000000000000000000000000746D7032 */ const u8 salt[16] = { 0x32, 0x70, 0x6d, 0x74 }; if (smp_h7(smp->tfm_cmac, key->val, salt, smp->tk)) return; } else { /* From core spec. Spells out in ASCII as 'tmp2'. */ const u8 tmp2[4] = { 0x32, 0x70, 0x6d, 0x74 }; if (smp_h6(smp->tfm_cmac, key->val, tmp2, smp->tk)) return; } if (smp_h6(smp->tfm_cmac, smp->tk, brle, smp->tk)) return; sc_add_ltk(smp); } static void smp_distribute_keys(struct smp_chan *smp) { struct smp_cmd_pairing *req, *rsp; struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; __u8 *keydist; bt_dev_dbg(hdev, "conn %p", conn); rsp = (void *) &smp->prsp[1]; /* The responder sends its keys first */ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags) && (smp->remote_key_dist & KEY_DIST_MASK)) { smp_allow_key_dist(smp); return; } req = (void *) &smp->preq[1]; if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { keydist = &rsp->init_key_dist; *keydist &= req->init_key_dist; } else { keydist = &rsp->resp_key_dist; *keydist &= req->resp_key_dist; } if (test_bit(SMP_FLAG_SC, &smp->flags)) { if (hcon->type == LE_LINK && (*keydist & SMP_DIST_LINK_KEY)) sc_generate_link_key(smp); if (hcon->type == ACL_LINK && (*keydist & SMP_DIST_ENC_KEY)) sc_generate_ltk(smp); /* Clear the keys which are generated but not distributed */ *keydist &= ~SMP_SC_NO_DIST; } bt_dev_dbg(hdev, "keydist 0x%x", *keydist); if (*keydist & SMP_DIST_ENC_KEY) { struct smp_cmd_encrypt_info enc; struct smp_cmd_initiator_ident ident; struct smp_ltk *ltk; u8 authenticated; __le16 ediv; __le64 rand; /* Make sure we generate only the significant amount of * bytes based on the encryption key size, and set the rest * of the value to zeroes. */ get_random_bytes(enc.ltk, smp->enc_key_size); memset(enc.ltk + smp->enc_key_size, 0, sizeof(enc.ltk) - smp->enc_key_size); get_random_bytes(&ediv, sizeof(ediv)); get_random_bytes(&rand, sizeof(rand)); smp_send_cmd(conn, SMP_CMD_ENCRYPT_INFO, sizeof(enc), &enc); authenticated = hcon->sec_level == BT_SECURITY_HIGH; ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, SMP_LTK_RESPONDER, authenticated, enc.ltk, smp->enc_key_size, ediv, rand); smp->responder_ltk = ltk; ident.ediv = ediv; ident.rand = rand; smp_send_cmd(conn, SMP_CMD_INITIATOR_IDENT, sizeof(ident), &ident); *keydist &= ~SMP_DIST_ENC_KEY; } if (*keydist & SMP_DIST_ID_KEY) { struct smp_cmd_ident_addr_info addrinfo; struct smp_cmd_ident_info idinfo; memcpy(idinfo.irk, hdev->irk, sizeof(idinfo.irk)); smp_send_cmd(conn, SMP_CMD_IDENT_INFO, sizeof(idinfo), &idinfo); /* The hci_conn contains the local identity address * after the connection has been established. * * This is true even when the connection has been * established using a resolvable random address. 
*/ bacpy(&addrinfo.bdaddr, &hcon->src); addrinfo.addr_type = hcon->src_type; smp_send_cmd(conn, SMP_CMD_IDENT_ADDR_INFO, sizeof(addrinfo), &addrinfo); *keydist &= ~SMP_DIST_ID_KEY; } if (*keydist & SMP_DIST_SIGN) { struct smp_cmd_sign_info sign; struct smp_csrk *csrk; /* Generate a new random key */ get_random_bytes(sign.csrk, sizeof(sign.csrk)); csrk = kzalloc(sizeof(*csrk), GFP_KERNEL); if (csrk) { if (hcon->sec_level > BT_SECURITY_MEDIUM) csrk->type = MGMT_CSRK_LOCAL_AUTHENTICATED; else csrk->type = MGMT_CSRK_LOCAL_UNAUTHENTICATED; memcpy(csrk->val, sign.csrk, sizeof(csrk->val)); } smp->responder_csrk = csrk; smp_send_cmd(conn, SMP_CMD_SIGN_INFO, sizeof(sign), &sign); *keydist &= ~SMP_DIST_SIGN; } /* If there are still keys to be received wait for them */ if (smp->remote_key_dist & KEY_DIST_MASK) { smp_allow_key_dist(smp); return; } set_bit(SMP_FLAG_COMPLETE, &smp->flags); smp_notify_keys(conn); smp_chan_destroy(conn); } static void smp_timeout(struct work_struct *work) { struct smp_chan *smp = container_of(work, struct smp_chan, security_timer.work); struct l2cap_conn *conn = smp->conn; bt_dev_dbg(conn->hcon->hdev, "conn %p", conn); hci_disconnect(conn->hcon, HCI_ERROR_AUTH_FAILURE); } static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) { struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp; smp = kzalloc(sizeof(*smp), GFP_ATOMIC); if (!smp) return NULL; smp->tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(smp->tfm_cmac)) { bt_dev_err(hcon->hdev, "Unable to create CMAC crypto context"); goto zfree_smp; } smp->tfm_ecdh = crypto_alloc_kpp("ecdh-nist-p256", 0, 0); if (IS_ERR(smp->tfm_ecdh)) { bt_dev_err(hcon->hdev, "Unable to create ECDH crypto context"); goto free_shash; } smp->conn = conn; chan->data = smp; SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_FAIL); INIT_DELAYED_WORK(&smp->security_timer, smp_timeout); hci_conn_hold(hcon); return smp; free_shash: crypto_free_shash(smp->tfm_cmac); zfree_smp: kfree_sensitive(smp); return NULL; } static int sc_mackey_and_ltk(struct smp_chan *smp, u8 mackey[16], u8 ltk[16]) { struct hci_conn *hcon = smp->conn->hcon; u8 *na, *nb, a[7], b[7]; if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { na = smp->prnd; nb = smp->rrnd; } else { na = smp->rrnd; nb = smp->prnd; } memcpy(a, &hcon->init_addr, 6); memcpy(b, &hcon->resp_addr, 6); a[6] = hcon->init_addr_type; b[6] = hcon->resp_addr_type; return smp_f5(smp->tfm_cmac, smp->dhkey, na, nb, a, b, mackey, ltk); } static void sc_dhkey_check(struct smp_chan *smp) { struct hci_conn *hcon = smp->conn->hcon; struct smp_cmd_dhkey_check check; u8 a[7], b[7], *local_addr, *remote_addr; u8 io_cap[3], r[16]; memcpy(a, &hcon->init_addr, 6); memcpy(b, &hcon->resp_addr, 6); a[6] = hcon->init_addr_type; b[6] = hcon->resp_addr_type; if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { local_addr = a; remote_addr = b; memcpy(io_cap, &smp->preq[1], 3); } else { local_addr = b; remote_addr = a; memcpy(io_cap, &smp->prsp[1], 3); } memset(r, 0, sizeof(r)); if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) put_unaligned_le32(hcon->passkey_notify, r); if (smp->method == REQ_OOB) memcpy(r, smp->rr, 16); smp_f6(smp->tfm_cmac, smp->mackey, smp->prnd, smp->rrnd, r, io_cap, local_addr, remote_addr, check.e); smp_send_cmd(smp->conn, SMP_CMD_DHKEY_CHECK, sizeof(check), &check); } static u8 sc_passkey_send_confirm(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; struct smp_cmd_pairing_confirm cfm; u8 r; r = 
((hcon->passkey_notify >> smp->passkey_round) & 0x01); r |= 0x80; get_random_bytes(smp->prnd, sizeof(smp->prnd)); if (smp_f4(smp->tfm_cmac, smp->local_pk, smp->remote_pk, smp->prnd, r, cfm.confirm_val)) return SMP_UNSPECIFIED; smp_send_cmd(conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cfm), &cfm); return 0; } static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op) { struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; u8 cfm[16], r; /* Ignore the PDU if we've already done 20 rounds (0 - 19) */ if (smp->passkey_round >= 20) return 0; switch (smp_op) { case SMP_CMD_PAIRING_RANDOM: r = ((hcon->passkey_notify >> smp->passkey_round) & 0x01); r |= 0x80; if (smp_f4(smp->tfm_cmac, smp->remote_pk, smp->local_pk, smp->rrnd, r, cfm)) return SMP_UNSPECIFIED; if (crypto_memneq(smp->pcnf, cfm, 16)) return SMP_CONFIRM_FAILED; smp->passkey_round++; if (smp->passkey_round == 20) { /* Generate MacKey and LTK */ if (sc_mackey_and_ltk(smp, smp->mackey, smp->tk)) return SMP_UNSPECIFIED; } /* The round is only complete when the initiator * receives pairing random. */ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); if (smp->passkey_round == 20) SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); else SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); return 0; } /* Start the next round */ if (smp->passkey_round != 20) return sc_passkey_round(smp, 0); /* Passkey rounds are complete - start DHKey Check */ sc_dhkey_check(smp); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); break; case SMP_CMD_PAIRING_CONFIRM: if (test_bit(SMP_FLAG_WAIT_USER, &smp->flags)) { set_bit(SMP_FLAG_CFM_PENDING, &smp->flags); return 0; } SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); return 0; } return sc_passkey_send_confirm(smp); case SMP_CMD_PUBLIC_KEY: default: /* Initiating device starts the round */ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return 0; bt_dev_dbg(hdev, "Starting passkey round %u", smp->passkey_round + 1); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); return sc_passkey_send_confirm(smp); } return 0; } static int sc_user_reply(struct smp_chan *smp, u16 mgmt_op, __le32 passkey) { struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; u8 smp_op; clear_bit(SMP_FLAG_WAIT_USER, &smp->flags); switch (mgmt_op) { case MGMT_OP_USER_PASSKEY_NEG_REPLY: smp_failure(smp->conn, SMP_PASSKEY_ENTRY_FAILED); return 0; case MGMT_OP_USER_CONFIRM_NEG_REPLY: smp_failure(smp->conn, SMP_NUMERIC_COMP_FAILED); return 0; case MGMT_OP_USER_PASSKEY_REPLY: hcon->passkey_notify = le32_to_cpu(passkey); smp->passkey_round = 0; if (test_and_clear_bit(SMP_FLAG_CFM_PENDING, &smp->flags)) smp_op = SMP_CMD_PAIRING_CONFIRM; else smp_op = 0; if (sc_passkey_round(smp, smp_op)) return -EIO; return 0; } /* Initiator sends DHKey check first */ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { sc_dhkey_check(smp); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); } else if (test_and_clear_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags)) { sc_dhkey_check(smp); sc_add_ltk(smp); } return 0; } int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) { struct l2cap_conn *conn = hcon->l2cap_data; struct l2cap_chan *chan; struct smp_chan *smp; u32 value; int err; if (!conn) return -ENOTCONN; bt_dev_dbg(conn->hcon->hdev, ""); chan = conn->smp; if (!chan) return -ENOTCONN; l2cap_chan_lock(chan); if (!chan->data) { err = 
-ENOTCONN; goto unlock; } smp = chan->data; if (test_bit(SMP_FLAG_SC, &smp->flags)) { err = sc_user_reply(smp, mgmt_op, passkey); goto unlock; } switch (mgmt_op) { case MGMT_OP_USER_PASSKEY_REPLY: value = le32_to_cpu(passkey); memset(smp->tk, 0, sizeof(smp->tk)); bt_dev_dbg(conn->hcon->hdev, "PassKey: %u", value); put_unaligned_le32(value, smp->tk); fallthrough; case MGMT_OP_USER_CONFIRM_REPLY: set_bit(SMP_FLAG_TK_VALID, &smp->flags); break; case MGMT_OP_USER_PASSKEY_NEG_REPLY: case MGMT_OP_USER_CONFIRM_NEG_REPLY: smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED); err = 0; goto unlock; default: smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED); err = -EOPNOTSUPP; goto unlock; } err = 0; /* If it is our turn to send Pairing Confirm, do so now */ if (test_bit(SMP_FLAG_CFM_PENDING, &smp->flags)) { u8 rsp = smp_confirm(smp); if (rsp) smp_failure(conn, rsp); } unlock: l2cap_chan_unlock(chan); return err; } static void build_bredr_pairing_cmd(struct smp_chan *smp, struct smp_cmd_pairing *req, struct smp_cmd_pairing *rsp) { struct l2cap_conn *conn = smp->conn; struct hci_dev *hdev = conn->hcon->hdev; u8 local_dist = 0, remote_dist = 0; if (hci_dev_test_flag(hdev, HCI_BONDABLE)) { local_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; remote_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; } if (hci_dev_test_flag(hdev, HCI_RPA_RESOLVING)) remote_dist |= SMP_DIST_ID_KEY; if (hci_dev_test_flag(hdev, HCI_PRIVACY)) local_dist |= SMP_DIST_ID_KEY; if (!rsp) { memset(req, 0, sizeof(*req)); req->auth_req = SMP_AUTH_CT2; req->init_key_dist = local_dist; req->resp_key_dist = remote_dist; req->max_key_size = conn->hcon->enc_key_size; smp->remote_key_dist = remote_dist; return; } memset(rsp, 0, sizeof(*rsp)); rsp->auth_req = SMP_AUTH_CT2; rsp->max_key_size = conn->hcon->enc_key_size; rsp->init_key_dist = req->init_key_dist & remote_dist; rsp->resp_key_dist = req->resp_key_dist & local_dist; smp->remote_key_dist = rsp->init_key_dist; } static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_pairing rsp, *req = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct hci_dev *hdev = conn->hcon->hdev; struct smp_chan *smp = chan->data; u8 key_size, auth, sec_level; int ret; bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*req)) return SMP_INVALID_PARAMS; if (smp && test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return SMP_CMD_NOTSUPP; if (!smp) { smp = smp_chan_create(conn); if (!smp) return SMP_UNSPECIFIED; } /* We didn't start the pairing, so match remote */ auth = req->auth_req & AUTH_REQ_MASK(hdev); if (!hci_dev_test_flag(hdev, HCI_BONDABLE) && (auth & SMP_AUTH_BONDING)) return SMP_PAIRING_NOTSUPP; if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC)) return SMP_AUTH_REQUIREMENTS; smp->preq[0] = SMP_CMD_PAIRING_REQ; memcpy(&smp->preq[1], req, sizeof(*req)); skb_pull(skb, sizeof(*req)); /* If the remote side's OOB flag is set it means it has * successfully received our local OOB data - therefore set the * flag to indicate that local OOB is in use. 
*/ if (req->oob_flag == SMP_OOB_PRESENT && SMP_DEV(hdev)->local_oob) set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags); /* SMP over BR/EDR requires special treatment */ if (conn->hcon->type == ACL_LINK) { /* We must have a BR/EDR SC link */ if (!test_bit(HCI_CONN_AES_CCM, &conn->hcon->flags) && !hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return SMP_CROSS_TRANSP_NOT_ALLOWED; set_bit(SMP_FLAG_SC, &smp->flags); build_bredr_pairing_cmd(smp, req, &rsp); if (req->auth_req & SMP_AUTH_CT2) set_bit(SMP_FLAG_CT2, &smp->flags); key_size = min(req->max_key_size, rsp.max_key_size); if (check_enc_key_size(conn, key_size)) return SMP_ENC_KEY_SIZE; /* Clear bits which are generated but not distributed */ smp->remote_key_dist &= ~SMP_SC_NO_DIST; smp->prsp[0] = SMP_CMD_PAIRING_RSP; memcpy(&smp->prsp[1], &rsp, sizeof(rsp)); smp_send_cmd(conn, SMP_CMD_PAIRING_RSP, sizeof(rsp), &rsp); smp_distribute_keys(smp); return 0; } build_pairing_cmd(conn, req, &rsp, auth); if (rsp.auth_req & SMP_AUTH_SC) { set_bit(SMP_FLAG_SC, &smp->flags); if (rsp.auth_req & SMP_AUTH_CT2) set_bit(SMP_FLAG_CT2, &smp->flags); } if (conn->hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) sec_level = BT_SECURITY_MEDIUM; else sec_level = authreq_to_seclevel(auth); if (sec_level > conn->hcon->pending_sec_level) conn->hcon->pending_sec_level = sec_level; /* If we need MITM check that it can be achieved */ if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) { u8 method; method = get_auth_method(smp, conn->hcon->io_capability, req->io_capability); if (method == JUST_WORKS || method == JUST_CFM) return SMP_AUTH_REQUIREMENTS; } key_size = min(req->max_key_size, rsp.max_key_size); if (check_enc_key_size(conn, key_size)) return SMP_ENC_KEY_SIZE; get_random_bytes(smp->prnd, sizeof(smp->prnd)); smp->prsp[0] = SMP_CMD_PAIRING_RSP; memcpy(&smp->prsp[1], &rsp, sizeof(rsp)); smp_send_cmd(conn, SMP_CMD_PAIRING_RSP, sizeof(rsp), &rsp); clear_bit(SMP_FLAG_INITIATOR, &smp->flags); /* Strictly speaking we shouldn't allow Pairing Confirm for the * SC case, however some implementations incorrectly copy RFU auth * req bits from our security request, which may create a false * positive SC enablement. */ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); if (test_bit(SMP_FLAG_SC, &smp->flags)) { SMP_ALLOW_CMD(smp, SMP_CMD_PUBLIC_KEY); /* Clear bits which are generated but not distributed */ smp->remote_key_dist &= ~SMP_SC_NO_DIST; /* Wait for Public Key from Initiating Device */ return 0; } /* Request setup of TK */ ret = tk_request(conn, 0, auth, rsp.io_capability, req->io_capability); if (ret) return SMP_UNSPECIFIED; return 0; } static u8 sc_send_public_key(struct smp_chan *smp) { struct hci_dev *hdev = smp->conn->hcon->hdev; bt_dev_dbg(hdev, ""); if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) { struct l2cap_chan *chan = hdev->smp_data; struct smp_dev *smp_dev; if (!chan || !chan->data) return SMP_UNSPECIFIED; smp_dev = chan->data; memcpy(smp->local_pk, smp_dev->local_pk, 64); memcpy(smp->lr, smp_dev->local_rand, 16); if (smp_dev->debug_key) set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags); goto done; } if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) { bt_dev_dbg(hdev, "Using debug keys"); if (set_ecdh_privkey(smp->tfm_ecdh, debug_sk)) return SMP_UNSPECIFIED; memcpy(smp->local_pk, debug_pk, 64); set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags); } else { while (true) { /* Generate key pair for Secure Connections */ if (generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk)) return SMP_UNSPECIFIED; /* This is unlikely, but we need to check that * we didn't accidentally generate a debug key. 
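 *
 * (Added note, not part of the upstream comment: the debug key pair is
 * published in the Core Specification, so its private key is public
 * knowledge; a randomly generated pair that happened to match debug_pk
 * would offer no confidentiality and is therefore regenerated.)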
*/ if (crypto_memneq(smp->local_pk, debug_pk, 64)) break; } } done: SMP_DBG("Local Public Key X: %32phN", smp->local_pk); SMP_DBG("Local Public Key Y: %32phN", smp->local_pk + 32); smp_send_cmd(smp->conn, SMP_CMD_PUBLIC_KEY, 64, smp->local_pk); return 0; } static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_pairing *req, *rsp = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_dev *hdev = conn->hcon->hdev; u8 key_size, auth; int ret; bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*rsp)) return SMP_INVALID_PARAMS; if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return SMP_CMD_NOTSUPP; skb_pull(skb, sizeof(*rsp)); req = (void *) &smp->preq[1]; key_size = min(req->max_key_size, rsp->max_key_size); if (check_enc_key_size(conn, key_size)) return SMP_ENC_KEY_SIZE; auth = rsp->auth_req & AUTH_REQ_MASK(hdev); if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC)) return SMP_AUTH_REQUIREMENTS; /* If the remote side's OOB flag is set it means it has * successfully received our local OOB data - therefore set the * flag to indicate that local OOB is in use. */ if (rsp->oob_flag == SMP_OOB_PRESENT && SMP_DEV(hdev)->local_oob) set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags); smp->prsp[0] = SMP_CMD_PAIRING_RSP; memcpy(&smp->prsp[1], rsp, sizeof(*rsp)); /* Update remote key distribution in case the remote cleared * some bits that we had enabled in our request. */ smp->remote_key_dist &= rsp->resp_key_dist; if ((req->auth_req & SMP_AUTH_CT2) && (auth & SMP_AUTH_CT2)) set_bit(SMP_FLAG_CT2, &smp->flags); /* For BR/EDR this means we're done and can start phase 3 */ if (conn->hcon->type == ACL_LINK) { /* Clear bits which are generated but not distributed */ smp->remote_key_dist &= ~SMP_SC_NO_DIST; smp_distribute_keys(smp); return 0; } if ((req->auth_req & SMP_AUTH_SC) && (auth & SMP_AUTH_SC)) set_bit(SMP_FLAG_SC, &smp->flags); else if (conn->hcon->pending_sec_level > BT_SECURITY_HIGH) conn->hcon->pending_sec_level = BT_SECURITY_HIGH; /* If we need MITM check that it can be achieved */ if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) { u8 method; method = get_auth_method(smp, req->io_capability, rsp->io_capability); if (method == JUST_WORKS || method == JUST_CFM) return SMP_AUTH_REQUIREMENTS; } get_random_bytes(smp->prnd, sizeof(smp->prnd)); /* Update remote key distribution in case the remote cleared * some bits that we had enabled in our request. 
*/ smp->remote_key_dist &= rsp->resp_key_dist; if (test_bit(SMP_FLAG_SC, &smp->flags)) { /* Clear bits which are generated but not distributed */ smp->remote_key_dist &= ~SMP_SC_NO_DIST; SMP_ALLOW_CMD(smp, SMP_CMD_PUBLIC_KEY); return sc_send_public_key(smp); } auth |= req->auth_req; ret = tk_request(conn, 0, auth, req->io_capability, rsp->io_capability); if (ret) return SMP_UNSPECIFIED; set_bit(SMP_FLAG_CFM_PENDING, &smp->flags); /* Can't compose response until we have been confirmed */ if (test_bit(SMP_FLAG_TK_VALID, &smp->flags)) return smp_confirm(smp); return 0; } static u8 sc_check_confirm(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; bt_dev_dbg(conn->hcon->hdev, ""); if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) return sc_passkey_round(smp, SMP_CMD_PAIRING_CONFIRM); if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); } return 0; } /* Work-around for some implementations that incorrectly copy RFU bits * from our security request and thereby create the impression that * we're doing SC when in fact the remote doesn't support it. */ static int fixup_sc_false_positive(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; struct smp_cmd_pairing *req, *rsp; u8 auth; /* The issue is only observed when we're in responder role */ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return SMP_UNSPECIFIED; if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) { bt_dev_err(hdev, "refusing legacy fallback in SC-only mode"); return SMP_UNSPECIFIED; } bt_dev_err(hdev, "trying to fall back to legacy SMP"); req = (void *) &smp->preq[1]; rsp = (void *) &smp->prsp[1]; /* Rebuild key dist flags which may have been cleared for SC */ smp->remote_key_dist = (req->init_key_dist & rsp->resp_key_dist); auth = req->auth_req & AUTH_REQ_MASK(hdev); if (tk_request(conn, 0, auth, rsp->io_capability, req->io_capability)) { bt_dev_err(hdev, "failed to fall back to legacy SMP"); return SMP_UNSPECIFIED; } clear_bit(SMP_FLAG_SC, &smp->flags); return 0; } static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; bt_dev_dbg(hdev, "conn %p %s", conn, test_bit(SMP_FLAG_INITIATOR, &smp->flags) ? 
"initiator" : "responder"); if (skb->len < sizeof(smp->pcnf)) return SMP_INVALID_PARAMS; memcpy(smp->pcnf, skb->data, sizeof(smp->pcnf)); skb_pull(skb, sizeof(smp->pcnf)); if (test_bit(SMP_FLAG_SC, &smp->flags)) { int ret; /* Public Key exchange must happen before any other steps */ if (test_bit(SMP_FLAG_REMOTE_PK, &smp->flags)) return sc_check_confirm(smp); bt_dev_err(hdev, "Unexpected SMP Pairing Confirm"); ret = fixup_sc_false_positive(smp); if (ret) return ret; } if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); return 0; } if (test_bit(SMP_FLAG_TK_VALID, &smp->flags)) return smp_confirm(smp); set_bit(SMP_FLAG_CFM_PENDING, &smp->flags); return 0; } static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; u8 *pkax, *pkbx, *na, *nb, confirm_hint; u32 passkey = 0; int err; bt_dev_dbg(hcon->hdev, "conn %p", conn); if (skb->len < sizeof(smp->rrnd)) return SMP_INVALID_PARAMS; memcpy(smp->rrnd, skb->data, sizeof(smp->rrnd)); skb_pull(skb, sizeof(smp->rrnd)); if (!test_bit(SMP_FLAG_SC, &smp->flags)) return smp_random(smp); if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { pkax = smp->local_pk; pkbx = smp->remote_pk; na = smp->prnd; nb = smp->rrnd; } else { pkax = smp->remote_pk; pkbx = smp->local_pk; na = smp->rrnd; nb = smp->prnd; } if (smp->method == REQ_OOB) { if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); goto mackey_and_ltk; } /* Passkey entry has special treatment */ if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) return sc_passkey_round(smp, SMP_CMD_PAIRING_RANDOM); if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { u8 cfm[16]; err = smp_f4(smp->tfm_cmac, smp->remote_pk, smp->local_pk, smp->rrnd, 0, cfm); if (err) return SMP_UNSPECIFIED; if (crypto_memneq(smp->pcnf, cfm, 16)) return SMP_CONFIRM_FAILED; } else { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); } mackey_and_ltk: /* Generate MacKey and LTK */ err = sc_mackey_and_ltk(smp, smp->mackey, smp->tk); if (err) return SMP_UNSPECIFIED; if (smp->method == REQ_OOB) { if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { sc_dhkey_check(smp); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); } return 0; } err = smp_g2(smp->tfm_cmac, pkax, pkbx, na, nb, &passkey); if (err) return SMP_UNSPECIFIED; /* Always require user confirmation for Just-Works pairing to prevent * impersonation attacks, or in case of a legitimate device that is * repairing use the confirmation as acknowledgment to proceed with the * creation of new keys. */ confirm_hint = smp->method == JUST_WORKS ? 
1 : 0; err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, confirm_hint); if (err) return SMP_UNSPECIFIED; set_bit(SMP_FLAG_WAIT_USER, &smp->flags); return 0; } static bool smp_ltk_encrypt(struct l2cap_conn *conn, u8 sec_level) { struct smp_ltk *key; struct hci_conn *hcon = conn->hcon; key = hci_find_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, hcon->role); if (!key) return false; if (smp_ltk_sec_level(key) < sec_level) return false; if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags)) return true; hci_le_start_enc(hcon, key->ediv, key->rand, key->val, key->enc_size); hcon->enc_key_size = key->enc_size; /* We never store STKs for initiator role, so clear this flag */ clear_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags); return true; } bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level, enum smp_key_pref key_pref) { if (sec_level == BT_SECURITY_LOW) return true; /* If we're encrypted with an STK but the caller prefers using * LTK claim insufficient security. This way we allow the * connection to be re-encrypted with an LTK, even if the LTK * provides the same level of security. Only exception is if we * don't have an LTK (e.g. because of key distribution bits). */ if (key_pref == SMP_USE_LTK && test_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags) && hci_find_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, hcon->role)) return false; if (hcon->sec_level >= sec_level) return true; return false; } static void smp_send_pairing_req(struct smp_chan *smp, __u8 auth) { struct smp_cmd_pairing cp; if (smp->conn->hcon->type == ACL_LINK) build_bredr_pairing_cmd(smp, &cp, NULL); else build_pairing_cmd(smp->conn, &cp, NULL, auth); smp->preq[0] = SMP_CMD_PAIRING_REQ; memcpy(&smp->preq[1], &cp, sizeof(cp)); smp_send_cmd(smp->conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP); set_bit(SMP_FLAG_INITIATOR, &smp->flags); } static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_security_req *rp = (void *) skb->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; struct smp_chan *smp; u8 sec_level, auth; bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; if (hcon->role != HCI_ROLE_MASTER) return SMP_CMD_NOTSUPP; auth = rp->auth_req & AUTH_REQ_MASK(hdev); if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC)) return SMP_AUTH_REQUIREMENTS; if (hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) sec_level = BT_SECURITY_MEDIUM; else sec_level = authreq_to_seclevel(auth); if (smp_sufficient_security(hcon, sec_level, SMP_USE_LTK)) { /* If link is already encrypted with sufficient security we * still need refresh encryption as per Core Spec 5.0 Vol 3, * Part H 2.4.6 */ smp_ltk_encrypt(conn, hcon->sec_level); return 0; } if (sec_level > hcon->pending_sec_level) hcon->pending_sec_level = sec_level; if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; smp = smp_chan_create(conn); if (!smp) return SMP_UNSPECIFIED; if (!hci_dev_test_flag(hdev, HCI_BONDABLE) && (auth & SMP_AUTH_BONDING)) return SMP_PAIRING_NOTSUPP; skb_pull(skb, sizeof(*rp)); smp_send_pairing_req(smp, auth); return 0; } static void smp_send_security_req(struct smp_chan *smp, __u8 auth) { struct smp_cmd_security_req cp; cp.auth_req = auth; smp_send_cmd(smp->conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_REQ); clear_bit(SMP_FLAG_INITIATOR, &smp->flags); } int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) { struct l2cap_conn 
*conn = hcon->l2cap_data; struct l2cap_chan *chan; struct smp_chan *smp; __u8 authreq; int ret; bt_dev_dbg(hcon->hdev, "conn %p hcon %p level 0x%2.2x", conn, hcon, sec_level); /* This may be NULL if there's an unexpected disconnection */ if (!conn) return 1; if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED)) return 1; if (smp_sufficient_security(hcon, sec_level, SMP_USE_LTK)) return 1; if (sec_level > hcon->pending_sec_level) hcon->pending_sec_level = sec_level; if (hcon->role == HCI_ROLE_MASTER) if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; chan = conn->smp; if (!chan) { bt_dev_err(hcon->hdev, "security requested but not available"); return 1; } l2cap_chan_lock(chan); /* If SMP is already in progress ignore this request */ if (chan->data) { ret = 0; goto unlock; } smp = smp_chan_create(conn); if (!smp) { ret = 1; goto unlock; } authreq = seclevel_to_authreq(sec_level); if (hci_dev_test_flag(hcon->hdev, HCI_SC_ENABLED)) { authreq |= SMP_AUTH_SC; if (hci_dev_test_flag(hcon->hdev, HCI_SSP_ENABLED)) authreq |= SMP_AUTH_CT2; } /* Don't attempt to set MITM if setting is overridden by debugfs * Needed to pass certification test SM/MAS/PKE/BV-01-C */ if (!hci_dev_test_flag(hcon->hdev, HCI_FORCE_NO_MITM)) { /* Require MITM if IO Capability allows or the security level * requires it. */ if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT || hcon->pending_sec_level > BT_SECURITY_MEDIUM) authreq |= SMP_AUTH_MITM; } if (hcon->role == HCI_ROLE_MASTER) smp_send_pairing_req(smp, authreq); else smp_send_security_req(smp, authreq); ret = 0; unlock: l2cap_chan_unlock(chan); return ret; } int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) { struct hci_conn *hcon; struct l2cap_conn *conn; struct l2cap_chan *chan; struct smp_chan *smp; int err; err = hci_remove_ltk(hdev, bdaddr, addr_type); hci_remove_irk(hdev, bdaddr, addr_type); hcon = hci_conn_hash_lookup_le(hdev, bdaddr, addr_type); if (!hcon) goto done; conn = hcon->l2cap_data; if (!conn) goto done; chan = conn->smp; if (!chan) goto done; l2cap_chan_lock(chan); smp = chan->data; if (smp) { /* Set keys to NULL to make sure smp_failure() does not try to * remove and free already invalidated rcu list entries. 
*/ smp->ltk = NULL; smp->responder_ltk = NULL; smp->remote_irk = NULL; if (test_bit(SMP_FLAG_COMPLETE, &smp->flags)) smp_failure(conn, 0); else smp_failure(conn, SMP_UNSPECIFIED); err = 0; } l2cap_chan_unlock(chan); done: return err; } static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_encrypt_info *rp = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; bt_dev_dbg(conn->hcon->hdev, "conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; /* Pairing is aborted if any blocked keys are distributed */ if (hci_is_blocked_key(conn->hcon->hdev, HCI_BLOCKED_KEY_TYPE_LTK, rp->ltk)) { bt_dev_warn_ratelimited(conn->hcon->hdev, "LTK blocked for %pMR", &conn->hcon->dst); return SMP_INVALID_PARAMS; } SMP_ALLOW_CMD(smp, SMP_CMD_INITIATOR_IDENT); skb_pull(skb, sizeof(*rp)); memcpy(smp->tk, rp->ltk, sizeof(smp->tk)); return 0; } static int smp_cmd_initiator_ident(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_initiator_ident *rp = (void *)skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_dev *hdev = conn->hcon->hdev; struct hci_conn *hcon = conn->hcon; struct smp_ltk *ltk; u8 authenticated; bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_ENC_KEY; if (smp->remote_key_dist & SMP_DIST_ID_KEY) SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_INFO); else if (smp->remote_key_dist & SMP_DIST_SIGN) SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO); skb_pull(skb, sizeof(*rp)); authenticated = (hcon->sec_level == BT_SECURITY_HIGH); ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, SMP_LTK, authenticated, smp->tk, smp->enc_key_size, rp->ediv, rp->rand); smp->ltk = ltk; if (!(smp->remote_key_dist & KEY_DIST_MASK)) smp_distribute_keys(smp); return 0; } static int smp_cmd_ident_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_ident_info *info = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; bt_dev_dbg(conn->hcon->hdev, ""); if (skb->len < sizeof(*info)) return SMP_INVALID_PARAMS; /* Pairing is aborted if any blocked keys are distributed */ if (hci_is_blocked_key(conn->hcon->hdev, HCI_BLOCKED_KEY_TYPE_IRK, info->irk)) { bt_dev_warn_ratelimited(conn->hcon->hdev, "Identity key blocked for %pMR", &conn->hcon->dst); return SMP_INVALID_PARAMS; } SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_ADDR_INFO); skb_pull(skb, sizeof(*info)); memcpy(smp->irk, info->irk, 16); return 0; } static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_ident_addr_info *info = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; bdaddr_t rpa; bt_dev_dbg(hcon->hdev, ""); if (skb->len < sizeof(*info)) return SMP_INVALID_PARAMS; /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_ID_KEY; if (smp->remote_key_dist & SMP_DIST_SIGN) SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO); skb_pull(skb, sizeof(*info)); /* Strictly speaking the Core Specification (4.1) allows sending * an empty address which would force us to rely on just the IRK * as "identity information". However, since such * implementations are not known of and in order to not over * complicate our implementation, simply pretend that we never * received an IRK for such a device. 
* * The Identity Address must also be a Static Random or Public * Address, which hci_is_identity_address() checks for. */ if (!bacmp(&info->bdaddr, BDADDR_ANY) || !hci_is_identity_address(&info->bdaddr, info->addr_type)) { bt_dev_err(hcon->hdev, "ignoring IRK with no identity address"); goto distribute; } /* Drop IRK if peer is using identity address during pairing but is * providing different address as identity information. * * Microsoft Surface Precision Mouse is known to have this bug. */ if (hci_is_identity_address(&hcon->dst, hcon->dst_type) && (bacmp(&info->bdaddr, &hcon->dst) || info->addr_type != hcon->dst_type)) { bt_dev_err(hcon->hdev, "ignoring IRK with invalid identity address"); goto distribute; } bacpy(&smp->id_addr, &info->bdaddr); smp->id_addr_type = info->addr_type; if (hci_bdaddr_is_rpa(&hcon->dst, hcon->dst_type)) bacpy(&rpa, &hcon->dst); else bacpy(&rpa, BDADDR_ANY); smp->remote_irk = hci_add_irk(conn->hcon->hdev, &smp->id_addr, smp->id_addr_type, smp->irk, &rpa); distribute: if (!(smp->remote_key_dist & KEY_DIST_MASK)) smp_distribute_keys(smp); return 0; } static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_sign_info *rp = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct smp_csrk *csrk; bt_dev_dbg(conn->hcon->hdev, "conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_SIGN; skb_pull(skb, sizeof(*rp)); csrk = kzalloc(sizeof(*csrk), GFP_KERNEL); if (csrk) { if (conn->hcon->sec_level > BT_SECURITY_MEDIUM) csrk->type = MGMT_CSRK_REMOTE_AUTHENTICATED; else csrk->type = MGMT_CSRK_REMOTE_UNAUTHENTICATED; memcpy(csrk->val, rp->csrk, sizeof(csrk->val)); } smp->csrk = csrk; smp_distribute_keys(smp); return 0; } static u8 sc_select_method(struct smp_chan *smp) { struct smp_cmd_pairing *local, *remote; u8 local_mitm, remote_mitm, local_io, remote_io, method; if (test_bit(SMP_FLAG_REMOTE_OOB, &smp->flags) || test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) return REQ_OOB; /* The preq/prsp contain the raw Pairing Request/Response PDUs * which are needed as inputs to some crypto functions. To get * the "struct smp_cmd_pairing" from them we need to skip the * first byte which contains the opcode. */ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { local = (void *) &smp->preq[1]; remote = (void *) &smp->prsp[1]; } else { local = (void *) &smp->prsp[1]; remote = (void *) &smp->preq[1]; } local_io = local->io_capability; remote_io = remote->io_capability; local_mitm = (local->auth_req & SMP_AUTH_MITM); remote_mitm = (remote->auth_req & SMP_AUTH_MITM); /* If either side wants MITM, look up the method from the table, * otherwise use JUST WORKS. 
*/ if (local_mitm || remote_mitm) method = get_auth_method(smp, local_io, remote_io); else method = JUST_WORKS; /* Don't confirm locally initiated pairing attempts */ if (method == JUST_CFM && test_bit(SMP_FLAG_INITIATOR, &smp->flags)) method = JUST_WORKS; return method; } static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_public_key *key = (void *) skb->data; struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_dev *hdev = hcon->hdev; struct crypto_kpp *tfm_ecdh; struct smp_cmd_pairing_confirm cfm; int err; bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*key)) return SMP_INVALID_PARAMS; /* Check if remote and local public keys are the same and debug key is * not in use. */ if (!test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags) && !crypto_memneq(key, smp->local_pk, 64)) { bt_dev_err(hdev, "Remote and local public keys are identical"); return SMP_UNSPECIFIED; } memcpy(smp->remote_pk, key, 64); if (test_bit(SMP_FLAG_REMOTE_OOB, &smp->flags)) { err = smp_f4(smp->tfm_cmac, smp->remote_pk, smp->remote_pk, smp->rr, 0, cfm.confirm_val); if (err) return SMP_UNSPECIFIED; if (crypto_memneq(cfm.confirm_val, smp->pcnf, 16)) return SMP_CONFIRM_FAILED; } /* Non-initiating device sends its public key after receiving * the key from the initiating device. */ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { err = sc_send_public_key(smp); if (err) return err; } SMP_DBG("Remote Public Key X: %32phN", smp->remote_pk); SMP_DBG("Remote Public Key Y: %32phN", smp->remote_pk + 32); /* Compute the shared secret on the same crypto tfm on which the private * key was set/generated. */ if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) { struct l2cap_chan *hchan = hdev->smp_data; struct smp_dev *smp_dev; if (!hchan || !hchan->data) return SMP_UNSPECIFIED; smp_dev = hchan->data; tfm_ecdh = smp_dev->tfm_ecdh; } else { tfm_ecdh = smp->tfm_ecdh; } if (compute_ecdh_secret(tfm_ecdh, smp->remote_pk, smp->dhkey)) return SMP_UNSPECIFIED; SMP_DBG("DHKey %32phN", smp->dhkey); set_bit(SMP_FLAG_REMOTE_PK, &smp->flags); smp->method = sc_select_method(smp); bt_dev_dbg(hdev, "selected method 0x%02x", smp->method); /* JUST_WORKS and JUST_CFM result in an unauthenticated key */ if (smp->method == JUST_WORKS || smp->method == JUST_CFM) hcon->pending_sec_level = BT_SECURITY_MEDIUM; else hcon->pending_sec_level = BT_SECURITY_FIPS; if (!crypto_memneq(debug_pk, smp->remote_pk, 64)) set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags); if (smp->method == DSP_PASSKEY) { get_random_bytes(&hcon->passkey_notify, sizeof(hcon->passkey_notify)); hcon->passkey_notify %= 1000000; hcon->passkey_entered = 0; smp->passkey_round = 0; if (mgmt_user_passkey_notify(hdev, &hcon->dst, hcon->type, hcon->dst_type, hcon->passkey_notify, hcon->passkey_entered)) return SMP_UNSPECIFIED; SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); return sc_passkey_round(smp, SMP_CMD_PUBLIC_KEY); } if (smp->method == REQ_OOB) { if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); return 0; } if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); if (smp->method == REQ_PASSKEY) { if (mgmt_user_passkey_request(hdev, &hcon->dst, hcon->type, hcon->dst_type)) return SMP_UNSPECIFIED; SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); set_bit(SMP_FLAG_WAIT_USER, &smp->flags); return 0; } /* The Initiating device waits for the non-initiating device to * 
send the confirm value. */ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return 0; err = smp_f4(smp->tfm_cmac, smp->local_pk, smp->remote_pk, smp->prnd, 0, cfm.confirm_val); if (err) return SMP_UNSPECIFIED; smp_send_cmd(conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cfm), &cfm); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); return 0; } static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_dhkey_check *check = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct hci_conn *hcon = conn->hcon; struct smp_chan *smp = chan->data; u8 a[7], b[7], *local_addr, *remote_addr; u8 io_cap[3], r[16], e[16]; int err; bt_dev_dbg(hcon->hdev, "conn %p", conn); if (skb->len < sizeof(*check)) return SMP_INVALID_PARAMS; memcpy(a, &hcon->init_addr, 6); memcpy(b, &hcon->resp_addr, 6); a[6] = hcon->init_addr_type; b[6] = hcon->resp_addr_type; if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { local_addr = a; remote_addr = b; memcpy(io_cap, &smp->prsp[1], 3); } else { local_addr = b; remote_addr = a; memcpy(io_cap, &smp->preq[1], 3); } memset(r, 0, sizeof(r)); if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) put_unaligned_le32(hcon->passkey_notify, r); else if (smp->method == REQ_OOB) memcpy(r, smp->lr, 16); err = smp_f6(smp->tfm_cmac, smp->mackey, smp->rrnd, smp->prnd, r, io_cap, remote_addr, local_addr, e); if (err) return SMP_UNSPECIFIED; if (crypto_memneq(check->e, e, 16)) return SMP_DHKEY_CHECK_FAILED; if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { if (test_bit(SMP_FLAG_WAIT_USER, &smp->flags)) { set_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags); return 0; } /* Responder sends DHKey check as response to initiator */ sc_dhkey_check(smp); } sc_add_ltk(smp); if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { hci_le_start_enc(hcon, 0, 0, smp->tk, smp->enc_key_size); hcon->enc_key_size = smp->enc_key_size; } return 0; } static int smp_cmd_keypress_notify(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_keypress_notify *kp = (void *) skb->data; bt_dev_dbg(conn->hcon->hdev, "value 0x%02x", kp->value); return 0; } static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) { struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; struct smp_chan *smp; __u8 code, reason; int err = 0; if (skb->len < 1) return -EILSEQ; if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED)) { reason = SMP_PAIRING_NOTSUPP; goto done; } code = skb->data[0]; skb_pull(skb, sizeof(code)); smp = chan->data; if (code > SMP_CMD_MAX) goto drop; if (smp && !test_and_clear_bit(code, &smp->allow_cmd)) { /* If there is a context and the command is not allowed consider * it a failure so the session is cleanup properly. */ switch (code) { case SMP_CMD_IDENT_INFO: case SMP_CMD_IDENT_ADDR_INFO: case SMP_CMD_SIGN_INFO: /* 3.6.1. Key distribution and generation * * A device may reject a distributed key by sending the * Pairing Failed command with the reason set to * "Key Rejected". */ smp_failure(conn, SMP_KEY_REJECTED); break; } goto drop; } /* If we don't have a context the only allowed commands are * pairing request and security request. 
*/ if (!smp && code != SMP_CMD_PAIRING_REQ && code != SMP_CMD_SECURITY_REQ) goto drop; switch (code) { case SMP_CMD_PAIRING_REQ: reason = smp_cmd_pairing_req(conn, skb); break; case SMP_CMD_PAIRING_FAIL: smp_failure(conn, 0); err = -EPERM; break; case SMP_CMD_PAIRING_RSP: reason = smp_cmd_pairing_rsp(conn, skb); break; case SMP_CMD_SECURITY_REQ: reason = smp_cmd_security_req(conn, skb); break; case SMP_CMD_PAIRING_CONFIRM: reason = smp_cmd_pairing_confirm(conn, skb); break; case SMP_CMD_PAIRING_RANDOM: reason = smp_cmd_pairing_random(conn, skb); break; case SMP_CMD_ENCRYPT_INFO: reason = smp_cmd_encrypt_info(conn, skb); break; case SMP_CMD_INITIATOR_IDENT: reason = smp_cmd_initiator_ident(conn, skb); break; case SMP_CMD_IDENT_INFO: reason = smp_cmd_ident_info(conn, skb); break; case SMP_CMD_IDENT_ADDR_INFO: reason = smp_cmd_ident_addr_info(conn, skb); break; case SMP_CMD_SIGN_INFO: reason = smp_cmd_sign_info(conn, skb); break; case SMP_CMD_PUBLIC_KEY: reason = smp_cmd_public_key(conn, skb); break; case SMP_CMD_DHKEY_CHECK: reason = smp_cmd_dhkey_check(conn, skb); break; case SMP_CMD_KEYPRESS_NOTIFY: reason = smp_cmd_keypress_notify(conn, skb); break; default: bt_dev_dbg(hcon->hdev, "Unknown command code 0x%2.2x", code); reason = SMP_CMD_NOTSUPP; goto done; } done: if (!err) { if (reason) smp_failure(conn, reason); kfree_skb(skb); } return err; drop: bt_dev_err(hcon->hdev, "unexpected SMP command 0x%02x from %pMR", code, &hcon->dst); kfree_skb(skb); return 0; } static void smp_teardown_cb(struct l2cap_chan *chan, int err) { struct l2cap_conn *conn = chan->conn; bt_dev_dbg(conn->hcon->hdev, "chan %p", chan); if (chan->data) smp_chan_destroy(conn); conn->smp = NULL; l2cap_chan_put(chan); } static void bredr_pairing(struct l2cap_chan *chan) { struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; struct smp_chan *smp; bt_dev_dbg(hdev, "chan %p", chan); /* Only new pairings are interesting */ if (!test_bit(HCI_CONN_NEW_LINK_KEY, &hcon->flags)) return; /* Don't bother if we're not encrypted */ if (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) return; /* Only initiator may initiate SMP over BR/EDR */ if (hcon->role != HCI_ROLE_MASTER) return; /* Secure Connections support must be enabled */ if (!hci_dev_test_flag(hdev, HCI_SC_ENABLED)) return; /* BR/EDR must use Secure Connections for SMP */ if (!test_bit(HCI_CONN_AES_CCM, &hcon->flags) && !hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return; /* If our LE support is not enabled don't do anything */ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return; /* Don't bother if remote LE support is not enabled */ if (!lmp_host_le_capable(hcon)) return; /* Remote must support SMP fixed chan for BR/EDR */ if (!(conn->remote_fixed_chan & L2CAP_FC_SMP_BREDR)) return; /* Don't bother if SMP is already ongoing */ if (chan->data) return; smp = smp_chan_create(conn); if (!smp) { bt_dev_err(hdev, "unable to create SMP context for BR/EDR"); return; } set_bit(SMP_FLAG_SC, &smp->flags); bt_dev_dbg(hdev, "starting SMP over BR/EDR"); smp_send_pairing_req(smp, 0x00); } static void smp_resume_cb(struct l2cap_chan *chan) { struct smp_chan *smp = chan->data; struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; bt_dev_dbg(hcon->hdev, "chan %p", chan); if (hcon->type == ACL_LINK) { bredr_pairing(chan); return; } if (!smp) return; if (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) return; cancel_delayed_work(&smp->security_timer); smp_distribute_keys(smp); } static void smp_ready_cb(struct l2cap_chan *chan) 
{ struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; bt_dev_dbg(hcon->hdev, "chan %p", chan); /* No need to call l2cap_chan_hold() here since we already own * the reference taken in smp_new_conn_cb(). This is just the * first time that we tie it to a specific pointer. The code in * l2cap_core.c ensures that there's no risk this function won't * get called if smp_new_conn_cb was previously called. */ conn->smp = chan; if (hcon->type == ACL_LINK && test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) bredr_pairing(chan); } static int smp_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) { int err; bt_dev_dbg(chan->conn->hcon->hdev, "chan %p", chan); err = smp_sig_channel(chan, skb); if (err) { struct smp_chan *smp = chan->data; if (smp) cancel_delayed_work_sync(&smp->security_timer); hci_disconnect(chan->conn->hcon, HCI_ERROR_AUTH_FAILURE); } return err; } static struct sk_buff *smp_alloc_skb_cb(struct l2cap_chan *chan, unsigned long hdr_len, unsigned long len, int nb) { struct sk_buff *skb; skb = bt_skb_alloc(hdr_len + len, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); skb->priority = HCI_PRIO_MAX; bt_cb(skb)->l2cap.chan = chan; return skb; } static const struct l2cap_ops smp_chan_ops = { .name = "Security Manager", .ready = smp_ready_cb, .recv = smp_recv_cb, .alloc_skb = smp_alloc_skb_cb, .teardown = smp_teardown_cb, .resume = smp_resume_cb, .new_connection = l2cap_chan_no_new_connection, .state_change = l2cap_chan_no_state_change, .close = l2cap_chan_no_close, .defer = l2cap_chan_no_defer, .suspend = l2cap_chan_no_suspend, .set_shutdown = l2cap_chan_no_set_shutdown, .get_sndtimeo = l2cap_chan_no_get_sndtimeo, }; static inline struct l2cap_chan *smp_new_conn_cb(struct l2cap_chan *pchan) { struct l2cap_chan *chan; BT_DBG("pchan %p", pchan); chan = l2cap_chan_create(); if (!chan) return NULL; chan->chan_type = pchan->chan_type; chan->ops = &smp_chan_ops; chan->scid = pchan->scid; chan->dcid = chan->scid; chan->imtu = pchan->imtu; chan->omtu = pchan->omtu; chan->mode = pchan->mode; /* Other L2CAP channels may request SMP routines in order to * change the security level. This means that the SMP channel * lock must be considered in its own category to avoid lockdep * warnings. 
*/ atomic_set(&chan->nesting, L2CAP_NESTING_SMP); BT_DBG("created chan %p", chan); return chan; } static const struct l2cap_ops smp_root_chan_ops = { .name = "Security Manager Root", .new_connection = smp_new_conn_cb, /* None of these are implemented for the root channel */ .close = l2cap_chan_no_close, .alloc_skb = l2cap_chan_no_alloc_skb, .recv = l2cap_chan_no_recv, .state_change = l2cap_chan_no_state_change, .teardown = l2cap_chan_no_teardown, .ready = l2cap_chan_no_ready, .defer = l2cap_chan_no_defer, .suspend = l2cap_chan_no_suspend, .resume = l2cap_chan_no_resume, .set_shutdown = l2cap_chan_no_set_shutdown, .get_sndtimeo = l2cap_chan_no_get_sndtimeo, }; static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) { struct l2cap_chan *chan; struct smp_dev *smp; struct crypto_shash *tfm_cmac; struct crypto_kpp *tfm_ecdh; if (cid == L2CAP_CID_SMP_BREDR) { smp = NULL; goto create_chan; } smp = kzalloc(sizeof(*smp), GFP_KERNEL); if (!smp) return ERR_PTR(-ENOMEM); tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(tfm_cmac)) { bt_dev_err(hdev, "Unable to create CMAC crypto context"); kfree_sensitive(smp); return ERR_CAST(tfm_cmac); } tfm_ecdh = crypto_alloc_kpp("ecdh-nist-p256", 0, 0); if (IS_ERR(tfm_ecdh)) { bt_dev_err(hdev, "Unable to create ECDH crypto context"); crypto_free_shash(tfm_cmac); kfree_sensitive(smp); return ERR_CAST(tfm_ecdh); } smp->local_oob = false; smp->tfm_cmac = tfm_cmac; smp->tfm_ecdh = tfm_ecdh; create_chan: chan = l2cap_chan_create(); if (!chan) { if (smp) { crypto_free_shash(smp->tfm_cmac); crypto_free_kpp(smp->tfm_ecdh); kfree_sensitive(smp); } return ERR_PTR(-ENOMEM); } chan->data = smp; l2cap_add_scid(chan, cid); l2cap_chan_set_defaults(chan); if (cid == L2CAP_CID_SMP) { u8 bdaddr_type; hci_copy_identity_address(hdev, &chan->src, &bdaddr_type); if (bdaddr_type == ADDR_LE_DEV_PUBLIC) chan->src_type = BDADDR_LE_PUBLIC; else chan->src_type = BDADDR_LE_RANDOM; } else { bacpy(&chan->src, &hdev->bdaddr); chan->src_type = BDADDR_BREDR; } chan->state = BT_LISTEN; chan->mode = L2CAP_MODE_BASIC; chan->imtu = L2CAP_DEFAULT_MTU; chan->ops = &smp_root_chan_ops; /* Set correct nesting level for a parent/listening channel */ atomic_set(&chan->nesting, L2CAP_NESTING_PARENT); return chan; } static void smp_del_chan(struct l2cap_chan *chan) { struct smp_dev *smp; BT_DBG("chan %p", chan); smp = chan->data; if (smp) { chan->data = NULL; crypto_free_shash(smp->tfm_cmac); crypto_free_kpp(smp->tfm_ecdh); kfree_sensitive(smp); } l2cap_chan_put(chan); } int smp_force_bredr(struct hci_dev *hdev, bool enable) { if (enable == hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return -EALREADY; if (enable) { struct l2cap_chan *chan; chan = smp_add_cid(hdev, L2CAP_CID_SMP_BREDR); if (IS_ERR(chan)) return PTR_ERR(chan); hdev->smp_bredr_data = chan; } else { struct l2cap_chan *chan; chan = hdev->smp_bredr_data; hdev->smp_bredr_data = NULL; smp_del_chan(chan); } hci_dev_change_flag(hdev, HCI_FORCE_BREDR_SMP); return 0; } int smp_register(struct hci_dev *hdev) { struct l2cap_chan *chan; bt_dev_dbg(hdev, ""); /* If the controller does not support Low Energy operation, then * there is also no need to register any SMP channel. 
*/ if (!lmp_le_capable(hdev)) return 0; if (WARN_ON(hdev->smp_data)) { chan = hdev->smp_data; hdev->smp_data = NULL; smp_del_chan(chan); } chan = smp_add_cid(hdev, L2CAP_CID_SMP); if (IS_ERR(chan)) return PTR_ERR(chan); hdev->smp_data = chan; if (!lmp_sc_capable(hdev)) { /* Flag can be already set here (due to power toggle) */ if (!hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return 0; } if (WARN_ON(hdev->smp_bredr_data)) { chan = hdev->smp_bredr_data; hdev->smp_bredr_data = NULL; smp_del_chan(chan); } chan = smp_add_cid(hdev, L2CAP_CID_SMP_BREDR); if (IS_ERR(chan)) { int err = PTR_ERR(chan); chan = hdev->smp_data; hdev->smp_data = NULL; smp_del_chan(chan); return err; } hdev->smp_bredr_data = chan; return 0; } void smp_unregister(struct hci_dev *hdev) { struct l2cap_chan *chan; if (hdev->smp_bredr_data) { chan = hdev->smp_bredr_data; hdev->smp_bredr_data = NULL; smp_del_chan(chan); } if (hdev->smp_data) { chan = hdev->smp_data; hdev->smp_data = NULL; smp_del_chan(chan); } } #if IS_ENABLED(CONFIG_BT_SELFTEST_SMP) static int __init test_debug_key(struct crypto_kpp *tfm_ecdh) { u8 pk[64]; int err; err = set_ecdh_privkey(tfm_ecdh, debug_sk); if (err) return err; err = generate_ecdh_public_key(tfm_ecdh, pk); if (err) return err; if (crypto_memneq(pk, debug_pk, 64)) return -EINVAL; return 0; } static int __init test_ah(void) { const u8 irk[16] = { 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34, 0x05, 0xad, 0xc8, 0x57, 0xa3, 0x34, 0x02, 0xec }; const u8 r[3] = { 0x94, 0x81, 0x70 }; const u8 exp[3] = { 0xaa, 0xfb, 0x0d }; u8 res[3]; int err; err = smp_ah(irk, r, res); if (err) return err; if (crypto_memneq(res, exp, 3)) return -EINVAL; return 0; } static int __init test_c1(void) { const u8 k[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; const u8 r[16] = { 0xe0, 0x2e, 0x70, 0xc6, 0x4e, 0x27, 0x88, 0x63, 0x0e, 0x6f, 0xad, 0x56, 0x21, 0xd5, 0x83, 0x57 }; const u8 preq[7] = { 0x01, 0x01, 0x00, 0x00, 0x10, 0x07, 0x07 }; const u8 pres[7] = { 0x02, 0x03, 0x00, 0x00, 0x08, 0x00, 0x05 }; const u8 _iat = 0x01; const u8 _rat = 0x00; const bdaddr_t ra = { { 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1 } }; const bdaddr_t ia = { { 0xa6, 0xa5, 0xa4, 0xa3, 0xa2, 0xa1 } }; const u8 exp[16] = { 0x86, 0x3b, 0xf1, 0xbe, 0xc5, 0x4d, 0xa7, 0xd2, 0xea, 0x88, 0x89, 0x87, 0xef, 0x3f, 0x1e, 0x1e }; u8 res[16]; int err; err = smp_c1(k, r, preq, pres, _iat, &ia, _rat, &ra, res); if (err) return err; if (crypto_memneq(res, exp, 16)) return -EINVAL; return 0; } static int __init test_s1(void) { const u8 k[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; const u8 r1[16] = { 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11 }; const u8 r2[16] = { 0x00, 0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99 }; const u8 exp[16] = { 0x62, 0xa0, 0x6d, 0x79, 0xae, 0x16, 0x42, 0x5b, 0x9b, 0xf4, 0xb0, 0xe8, 0xf0, 0xe1, 0x1f, 0x9a }; u8 res[16]; int err; err = smp_s1(k, r1, r2, res); if (err) return err; if (crypto_memneq(res, exp, 16)) return -EINVAL; return 0; } static int __init test_f4(struct crypto_shash *tfm_cmac) { const u8 u[32] = { 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc, 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef, 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e, 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20 }; const u8 v[32] = { 0xfd, 0xc5, 0x7f, 0xf4, 0x49, 0xdd, 0x4f, 0x6b, 0xfb, 0x7c, 0x9d, 0xf1, 0xc2, 0x9a, 0xcb, 0x59, 0x2a, 0xe7, 0xd4, 0xee, 0xfb, 0xfc, 0x0a, 0x90, 0x9a, 0xbb, 0xf6, 0x32, 0x3d, 0x8b, 0x18, 0x55 }; const 
u8 x[16] = { 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff, 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 }; const u8 z = 0x00; const u8 exp[16] = { 0x2d, 0x87, 0x74, 0xa9, 0xbe, 0xa1, 0xed, 0xf1, 0x1c, 0xbd, 0xa9, 0x07, 0xf1, 0x16, 0xc9, 0xf2 }; u8 res[16]; int err; err = smp_f4(tfm_cmac, u, v, x, z, res); if (err) return err; if (crypto_memneq(res, exp, 16)) return -EINVAL; return 0; } static int __init test_f5(struct crypto_shash *tfm_cmac) { const u8 w[32] = { 0x98, 0xa6, 0xbf, 0x73, 0xf3, 0x34, 0x8d, 0x86, 0xf1, 0x66, 0xf8, 0xb4, 0x13, 0x6b, 0x79, 0x99, 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34, 0x05, 0xad, 0xc8, 0x57, 0xa3, 0x34, 0x02, 0xec }; const u8 n1[16] = { 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff, 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 }; const u8 n2[16] = { 0xcf, 0xc4, 0x3d, 0xff, 0xf7, 0x83, 0x65, 0x21, 0x6e, 0x5f, 0xa7, 0x25, 0xcc, 0xe7, 0xe8, 0xa6 }; const u8 a1[7] = { 0xce, 0xbf, 0x37, 0x37, 0x12, 0x56, 0x00 }; const u8 a2[7] = { 0xc1, 0xcf, 0x2d, 0x70, 0x13, 0xa7, 0x00 }; const u8 exp_ltk[16] = { 0x38, 0x0a, 0x75, 0x94, 0xb5, 0x22, 0x05, 0x98, 0x23, 0xcd, 0xd7, 0x69, 0x11, 0x79, 0x86, 0x69 }; const u8 exp_mackey[16] = { 0x20, 0x6e, 0x63, 0xce, 0x20, 0x6a, 0x3f, 0xfd, 0x02, 0x4a, 0x08, 0xa1, 0x76, 0xf1, 0x65, 0x29 }; u8 mackey[16], ltk[16]; int err; err = smp_f5(tfm_cmac, w, n1, n2, a1, a2, mackey, ltk); if (err) return err; if (crypto_memneq(mackey, exp_mackey, 16)) return -EINVAL; if (crypto_memneq(ltk, exp_ltk, 16)) return -EINVAL; return 0; } static int __init test_f6(struct crypto_shash *tfm_cmac) { const u8 w[16] = { 0x20, 0x6e, 0x63, 0xce, 0x20, 0x6a, 0x3f, 0xfd, 0x02, 0x4a, 0x08, 0xa1, 0x76, 0xf1, 0x65, 0x29 }; const u8 n1[16] = { 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff, 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 }; const u8 n2[16] = { 0xcf, 0xc4, 0x3d, 0xff, 0xf7, 0x83, 0x65, 0x21, 0x6e, 0x5f, 0xa7, 0x25, 0xcc, 0xe7, 0xe8, 0xa6 }; const u8 r[16] = { 0xc8, 0x0f, 0x2d, 0x0c, 0xd2, 0x42, 0xda, 0x08, 0x54, 0xbb, 0x53, 0xb4, 0x3b, 0x34, 0xa3, 0x12 }; const u8 io_cap[3] = { 0x02, 0x01, 0x01 }; const u8 a1[7] = { 0xce, 0xbf, 0x37, 0x37, 0x12, 0x56, 0x00 }; const u8 a2[7] = { 0xc1, 0xcf, 0x2d, 0x70, 0x13, 0xa7, 0x00 }; const u8 exp[16] = { 0x61, 0x8f, 0x95, 0xda, 0x09, 0x0b, 0x6c, 0xd2, 0xc5, 0xe8, 0xd0, 0x9c, 0x98, 0x73, 0xc4, 0xe3 }; u8 res[16]; int err; err = smp_f6(tfm_cmac, w, n1, n2, r, io_cap, a1, a2, res); if (err) return err; if (crypto_memneq(res, exp, 16)) return -EINVAL; return 0; } static int __init test_g2(struct crypto_shash *tfm_cmac) { const u8 u[32] = { 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc, 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef, 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e, 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20 }; const u8 v[32] = { 0xfd, 0xc5, 0x7f, 0xf4, 0x49, 0xdd, 0x4f, 0x6b, 0xfb, 0x7c, 0x9d, 0xf1, 0xc2, 0x9a, 0xcb, 0x59, 0x2a, 0xe7, 0xd4, 0xee, 0xfb, 0xfc, 0x0a, 0x90, 0x9a, 0xbb, 0xf6, 0x32, 0x3d, 0x8b, 0x18, 0x55 }; const u8 x[16] = { 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff, 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 }; const u8 y[16] = { 0xcf, 0xc4, 0x3d, 0xff, 0xf7, 0x83, 0x65, 0x21, 0x6e, 0x5f, 0xa7, 0x25, 0xcc, 0xe7, 0xe8, 0xa6 }; const u32 exp_val = 0x2f9ed5ba % 1000000; u32 val; int err; err = smp_g2(tfm_cmac, u, v, x, y, &val); if (err) return err; if (val != exp_val) return -EINVAL; return 0; } static int __init test_h6(struct crypto_shash *tfm_cmac) { const u8 w[16] = { 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34, 0x05, 0xad, 0xc8, 0x57, 0xa3, 0x34, 0x02, 0xec }; 
const u8 key_id[4] = { 0x72, 0x62, 0x65, 0x6c }; const u8 exp[16] = { 0x99, 0x63, 0xb1, 0x80, 0xe2, 0xa9, 0xd3, 0xe8, 0x1c, 0xc9, 0x6d, 0xe7, 0x02, 0xe1, 0x9a, 0x2d }; u8 res[16]; int err; err = smp_h6(tfm_cmac, w, key_id, res); if (err) return err; if (crypto_memneq(res, exp, 16)) return -EINVAL; return 0; } static char test_smp_buffer[32]; static ssize_t test_smp_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { return simple_read_from_buffer(user_buf, count, ppos, test_smp_buffer, strlen(test_smp_buffer)); } static const struct file_operations test_smp_fops = { .open = simple_open, .read = test_smp_read, .llseek = default_llseek, }; static int __init run_selftests(struct crypto_shash *tfm_cmac, struct crypto_kpp *tfm_ecdh) { ktime_t calltime, delta, rettime; unsigned long long duration; int err; calltime = ktime_get(); err = test_debug_key(tfm_ecdh); if (err) { BT_ERR("debug_key test failed"); goto done; } err = test_ah(); if (err) { BT_ERR("smp_ah test failed"); goto done; } err = test_c1(); if (err) { BT_ERR("smp_c1 test failed"); goto done; } err = test_s1(); if (err) { BT_ERR("smp_s1 test failed"); goto done; } err = test_f4(tfm_cmac); if (err) { BT_ERR("smp_f4 test failed"); goto done; } err = test_f5(tfm_cmac); if (err) { BT_ERR("smp_f5 test failed"); goto done; } err = test_f6(tfm_cmac); if (err) { BT_ERR("smp_f6 test failed"); goto done; } err = test_g2(tfm_cmac); if (err) { BT_ERR("smp_g2 test failed"); goto done; } err = test_h6(tfm_cmac); if (err) { BT_ERR("smp_h6 test failed"); goto done; } rettime = ktime_get(); delta = ktime_sub(rettime, calltime); duration = (unsigned long long) ktime_to_ns(delta) >> 10; BT_INFO("SMP test passed in %llu usecs", duration); done: if (!err) snprintf(test_smp_buffer, sizeof(test_smp_buffer), "PASS (%llu usecs)\n", duration); else snprintf(test_smp_buffer, sizeof(test_smp_buffer), "FAIL\n"); debugfs_create_file("selftest_smp", 0444, bt_debugfs, NULL, &test_smp_fops); return err; } int __init bt_selftest_smp(void) { struct crypto_shash *tfm_cmac; struct crypto_kpp *tfm_ecdh; int err; tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); return PTR_ERR(tfm_cmac); } tfm_ecdh = crypto_alloc_kpp("ecdh-nist-p256", 0, 0); if (IS_ERR(tfm_ecdh)) { BT_ERR("Unable to create ECDH crypto context"); crypto_free_shash(tfm_cmac); return PTR_ERR(tfm_ecdh); } err = run_selftests(tfm_cmac, tfm_ecdh); crypto_free_shash(tfm_cmac); crypto_free_kpp(tfm_ecdh); return err; } #endif |
4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 
5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 
5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 
// SPDX-License-Identifier: GPL-2.0 /* * PCI Bus Services, see include/linux/pci.h for further explanation. * * Copyright 1993 -- 1997 Drew Eckhardt, Frederic Potter, * David Mosberger-Tang * * Copyright 1997 -- 2000 Martin Mares <mj@ucw.cz> */ #include <linux/acpi.h> #include <linux/kernel.h> #include <linux/delay.h> #include <linux/dmi.h> #include <linux/init.h> #include <linux/msi.h> #include <linux/of.h> #include <linux/pci.h> #include <linux/pm.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/log2.h> #include <linux/logic_pio.h> #include <linux/device.h> #include <linux/pm_runtime.h> #include <linux/pci_hotplug.h> #include <linux/vmalloc.h> #include <asm/dma.h> #include <linux/aer.h> #include <linux/bitfield.h> #include "pci.h" DEFINE_MUTEX(pci_slot_mutex); const char *pci_power_names[] = { "error", "D0", "D1", "D2", "D3hot", "D3cold", "unknown", }; EXPORT_SYMBOL_GPL(pci_power_names); #ifdef CONFIG_X86_32 int isa_dma_bridge_buggy; EXPORT_SYMBOL(isa_dma_bridge_buggy); #endif int pci_pci_problems; EXPORT_SYMBOL(pci_pci_problems); unsigned int pci_pm_d3hot_delay; static void pci_pme_list_scan(struct work_struct *work); static LIST_HEAD(pci_pme_list); static DEFINE_MUTEX(pci_pme_list_mutex); static DECLARE_DELAYED_WORK(pci_pme_work, pci_pme_list_scan); struct pci_pme_device { struct list_head list; struct pci_dev *dev; }; #define PME_TIMEOUT 1000 /* How long between PME checks */ /* * Following exit from Conventional Reset, devices must be ready within 1 sec * (PCIe r6.0 sec 6.6.1). A D3cold to D0 transition implies a Conventional * Reset (PCIe r6.0 sec 5.8). */ #define PCI_RESET_WAIT 1000 /* msec */ /* * Devices may extend the 1 sec period through Request Retry Status * completions (PCIe r6.0 sec 2.3.1). The spec does not provide an upper * limit, but 60 sec ought to be enough for any device to become * responsive. 
*/ #define PCIE_RESET_READY_POLL_MS 60000 /* msec */ static void pci_dev_d3_sleep(struct pci_dev *dev) { unsigned int delay_ms = max(dev->d3hot_delay, pci_pm_d3hot_delay); unsigned int upper; if (delay_ms) { /* Use a 20% upper bound, 1ms minimum */ upper = max(DIV_ROUND_CLOSEST(delay_ms, 5), 1U); usleep_range(delay_ms * USEC_PER_MSEC, (delay_ms + upper) * USEC_PER_MSEC); } } bool pci_reset_supported(struct pci_dev *dev) { return dev->reset_methods[0] != 0; } #ifdef CONFIG_PCI_DOMAINS int pci_domains_supported = 1; #endif #define DEFAULT_CARDBUS_IO_SIZE (256) #define DEFAULT_CARDBUS_MEM_SIZE (64*1024*1024) /* pci=cbmemsize=nnM,cbiosize=nn can override this */ unsigned long pci_cardbus_io_size = DEFAULT_CARDBUS_IO_SIZE; unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE; #define DEFAULT_HOTPLUG_IO_SIZE (256) #define DEFAULT_HOTPLUG_MMIO_SIZE (2*1024*1024) #define DEFAULT_HOTPLUG_MMIO_PREF_SIZE (2*1024*1024) /* hpiosize=nn can override this */ unsigned long pci_hotplug_io_size = DEFAULT_HOTPLUG_IO_SIZE; /* * pci=hpmmiosize=nnM overrides non-prefetchable MMIO size, * pci=hpmmioprefsize=nnM overrides prefetchable MMIO size; * pci=hpmemsize=nnM overrides both */ unsigned long pci_hotplug_mmio_size = DEFAULT_HOTPLUG_MMIO_SIZE; unsigned long pci_hotplug_mmio_pref_size = DEFAULT_HOTPLUG_MMIO_PREF_SIZE; #define DEFAULT_HOTPLUG_BUS_SIZE 1 unsigned long pci_hotplug_bus_size = DEFAULT_HOTPLUG_BUS_SIZE; /* PCIe MPS/MRRS strategy; can be overridden by kernel command-line param */ #ifdef CONFIG_PCIE_BUS_TUNE_OFF enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_TUNE_OFF; #elif defined CONFIG_PCIE_BUS_SAFE enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_SAFE; #elif defined CONFIG_PCIE_BUS_PERFORMANCE enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_PERFORMANCE; #elif defined CONFIG_PCIE_BUS_PEER2PEER enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_PEER2PEER; #else enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_DEFAULT; #endif /* * The default CLS is used if arch didn't set CLS explicitly and not * all pci devices agree on the same value. Arch can override either * the dfl or actual value as it sees fit. Don't forget this is * measured in 32-bit words, not bytes. */ u8 pci_dfl_cache_line_size __ro_after_init = L1_CACHE_BYTES >> 2; u8 pci_cache_line_size __ro_after_init ; /* * If we set up a device for bus mastering, we need to check the latency * timer as certain BIOSes forget to set it properly. */ unsigned int pcibios_max_latency = 255; /* If set, the PCIe ARI capability will not be used. */ static bool pcie_ari_disabled; /* If set, the PCIe ATS capability will not be used. */ static bool pcie_ats_disabled; /* If set, the PCI config space of each device is printed during boot. */ bool pci_early_dump; bool pci_ats_disabled(void) { return pcie_ats_disabled; } EXPORT_SYMBOL_GPL(pci_ats_disabled); /* Disable bridge_d3 for all PCIe ports */ static bool pci_bridge_d3_disable; /* Force bridge_d3 for all PCIe ports */ static bool pci_bridge_d3_force; static int __init pcie_port_pm_setup(char *str) { if (!strcmp(str, "off")) pci_bridge_d3_disable = true; else if (!strcmp(str, "force")) pci_bridge_d3_force = true; return 1; } __setup("pcie_port_pm=", pcie_port_pm_setup); /** * pci_bus_max_busnr - returns maximum PCI bus number of given bus' children * @bus: pointer to PCI bus structure to search * * Given a PCI bus, returns the highest PCI bus number present in the set * including the given PCI bus and its list of child PCI buses. 
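 *
 * A minimal usage sketch (hypothetical caller, assuming domain 0 has
 * already been scanned):
 *
 *	struct pci_bus *root = pci_find_bus(0, 0);
 *	unsigned char highest = root ? pci_bus_max_busnr(root) : 0;
 *
 * "highest" then holds the largest bus number reachable below bus 00.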
*/ unsigned char pci_bus_max_busnr(struct pci_bus *bus) { struct pci_bus *tmp; unsigned char max, n; max = bus->busn_res.end; list_for_each_entry(tmp, &bus->children, node) { n = pci_bus_max_busnr(tmp); if (n > max) max = n; } return max; } EXPORT_SYMBOL_GPL(pci_bus_max_busnr); /** * pci_status_get_and_clear_errors - return and clear error bits in PCI_STATUS * @pdev: the PCI device * * Returns error bits set in PCI_STATUS and clears them. */ int pci_status_get_and_clear_errors(struct pci_dev *pdev) { u16 status; int ret; ret = pci_read_config_word(pdev, PCI_STATUS, &status); if (ret != PCIBIOS_SUCCESSFUL) return -EIO; status &= PCI_STATUS_ERROR_BITS; if (status) pci_write_config_word(pdev, PCI_STATUS, status); return status; } EXPORT_SYMBOL_GPL(pci_status_get_and_clear_errors); #ifdef CONFIG_HAS_IOMEM static void __iomem *__pci_ioremap_resource(struct pci_dev *pdev, int bar, bool write_combine) { struct resource *res = &pdev->resource[bar]; resource_size_t start = res->start; resource_size_t size = resource_size(res); /* * Make sure the BAR is actually a memory resource, not an IO resource */ if (res->flags & IORESOURCE_UNSET || !(res->flags & IORESOURCE_MEM)) { pci_err(pdev, "can't ioremap BAR %d: %pR\n", bar, res); return NULL; } if (write_combine) return ioremap_wc(start, size); return ioremap(start, size); } void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar) { return __pci_ioremap_resource(pdev, bar, false); } EXPORT_SYMBOL_GPL(pci_ioremap_bar); void __iomem *pci_ioremap_wc_bar(struct pci_dev *pdev, int bar) { return __pci_ioremap_resource(pdev, bar, true); } EXPORT_SYMBOL_GPL(pci_ioremap_wc_bar); #endif /** * pci_dev_str_match_path - test if a path string matches a device * @dev: the PCI device to test * @path: string to match the device against * @endptr: pointer to the string after the match * * Test if a string (typically from a kernel parameter) formatted as a * path of device/function addresses matches a PCI device. The string must * be of the form: * * [<domain>:]<bus>:<device>.<func>[/<device>.<func>]* * * A path for a device can be obtained using 'lspci -t'. Using a path * is more robust against bus renumbering than using only a single bus, * device and function address. * * Returns 1 if the string matches the device, 0 if it does not and * a negative error code if it fails to parse the string. */ static int pci_dev_str_match_path(struct pci_dev *dev, const char *path, const char **endptr) { int ret; unsigned int seg, bus, slot, func; char *wpath, *p; char end; *endptr = strchrnul(path, ';'); wpath = kmemdup_nul(path, *endptr - path, GFP_ATOMIC); if (!wpath) return -ENOMEM; while (1) { p = strrchr(wpath, '/'); if (!p) break; ret = sscanf(p, "/%x.%x%c", &slot, &func, &end); if (ret != 2) { ret = -EINVAL; goto free_and_exit; } if (dev->devfn != PCI_DEVFN(slot, func)) { ret = 0; goto free_and_exit; } /* * Note: we don't need to get a reference to the upstream * bridge because we hold a reference to the top level * device which should hold a reference to the bridge, * and so on. 
*/ dev = pci_upstream_bridge(dev); if (!dev) { ret = 0; goto free_and_exit; } *p = 0; } ret = sscanf(wpath, "%x:%x:%x.%x%c", &seg, &bus, &slot, &func, &end); if (ret != 4) { seg = 0; ret = sscanf(wpath, "%x:%x.%x%c", &bus, &slot, &func, &end); if (ret != 3) { ret = -EINVAL; goto free_and_exit; } } ret = (seg == pci_domain_nr(dev->bus) && bus == dev->bus->number && dev->devfn == PCI_DEVFN(slot, func)); free_and_exit: kfree(wpath); return ret; } /** * pci_dev_str_match - test if a string matches a device * @dev: the PCI device to test * @p: string to match the device against * @endptr: pointer to the string after the match * * Test if a string (typically from a kernel parameter) matches a specified * PCI device. The string may be of one of the following formats: * * [<domain>:]<bus>:<device>.<func>[/<device>.<func>]* * pci:<vendor>:<device>[:<subvendor>:<subdevice>] * * The first format specifies a PCI bus/device/function address which * may change if new hardware is inserted, if motherboard firmware changes, * or due to changes caused in kernel parameters. If the domain is * left unspecified, it is taken to be 0. In order to be robust against * bus renumbering issues, a path of PCI device/function numbers may be used * to address the specific device. The path for a device can be determined * through the use of 'lspci -t'. * * The second format matches devices using IDs in the configuration * space which may match multiple devices in the system. A value of 0 * for any field will match all devices. (Note: this differs from * in-kernel code that uses PCI_ANY_ID which is ~0; this is for * legacy reasons and convenience so users don't have to specify * FFFFFFFFs on the command line.) * * Returns 1 if the string matches the device, 0 if it does not and * a negative error code if the string cannot be parsed. 
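 *
 * For example (placeholder IDs, not taken from real hardware):
 * "pci:8086:0001" matches any device with Vendor ID 0x8086 and Device ID
 * 0x0001 regardless of subsystem IDs, "0000:02:00.0" matches the device at
 * domain 0000, bus 02, device 00, function 0, and "0000:00:1c.0/00.0"
 * matches device 00, function 0 behind the bridge at 0000:00:1c.0.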
*/ static int pci_dev_str_match(struct pci_dev *dev, const char *p, const char **endptr) { int ret; int count; unsigned short vendor, device, subsystem_vendor, subsystem_device; if (strncmp(p, "pci:", 4) == 0) { /* PCI vendor/device (subvendor/subdevice) IDs are specified */ p += 4; ret = sscanf(p, "%hx:%hx:%hx:%hx%n", &vendor, &device, &subsystem_vendor, &subsystem_device, &count); if (ret != 4) { ret = sscanf(p, "%hx:%hx%n", &vendor, &device, &count); if (ret != 2) return -EINVAL; subsystem_vendor = 0; subsystem_device = 0; } p += count; if ((!vendor || vendor == dev->vendor) && (!device || device == dev->device) && (!subsystem_vendor || subsystem_vendor == dev->subsystem_vendor) && (!subsystem_device || subsystem_device == dev->subsystem_device)) goto found; } else { /* * PCI Bus, Device, Function IDs are specified * (optionally, may include a path of devfns following it) */ ret = pci_dev_str_match_path(dev, p, &p); if (ret < 0) return ret; else if (ret) goto found; } *endptr = p; return 0; found: *endptr = p; return 1; } static u8 __pci_find_next_cap(struct pci_bus *bus, unsigned int devfn, u8 pos, int cap) { return PCI_FIND_NEXT_CAP(pci_bus_read_config, pos, cap, bus, devfn); } u8 pci_find_next_capability(struct pci_dev *dev, u8 pos, int cap) { return __pci_find_next_cap(dev->bus, dev->devfn, pos + PCI_CAP_LIST_NEXT, cap); } EXPORT_SYMBOL_GPL(pci_find_next_capability); static u8 __pci_bus_find_cap_start(struct pci_bus *bus, unsigned int devfn, u8 hdr_type) { u16 status; pci_bus_read_config_word(bus, devfn, PCI_STATUS, &status); if (!(status & PCI_STATUS_CAP_LIST)) return 0; switch (hdr_type) { case PCI_HEADER_TYPE_NORMAL: case PCI_HEADER_TYPE_BRIDGE: return PCI_CAPABILITY_LIST; case PCI_HEADER_TYPE_CARDBUS: return PCI_CB_CAPABILITY_LIST; } return 0; } /** * pci_find_capability - query for devices' capabilities * @dev: PCI device to query * @cap: capability code * * Tell if a device supports a given PCI capability. * Returns the address of the requested capability structure within the * device's PCI configuration space or 0 in case the device does not * support it. Possible values for @cap include: * * %PCI_CAP_ID_PM Power Management * %PCI_CAP_ID_AGP Accelerated Graphics Port * %PCI_CAP_ID_VPD Vital Product Data * %PCI_CAP_ID_SLOTID Slot Identification * %PCI_CAP_ID_MSI Message Signalled Interrupts * %PCI_CAP_ID_CHSWP CompactPCI HotSwap * %PCI_CAP_ID_PCIX PCI-X * %PCI_CAP_ID_EXP PCI Express */ u8 pci_find_capability(struct pci_dev *dev, int cap) { u8 pos; pos = __pci_bus_find_cap_start(dev->bus, dev->devfn, dev->hdr_type); if (pos) pos = __pci_find_next_cap(dev->bus, dev->devfn, pos, cap); return pos; } EXPORT_SYMBOL(pci_find_capability); /** * pci_bus_find_capability - query for devices' capabilities * @bus: the PCI bus to query * @devfn: PCI device to query * @cap: capability code * * Like pci_find_capability() but works for PCI devices that do not have a * pci_dev structure set up yet. * * Returns the address of the requested capability structure within the * device's PCI configuration space or 0 in case the device does not * support it. 
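 *
 * A minimal usage sketch (hypothetical early-enumeration caller, before a
 * struct pci_dev exists for this function):
 *
 *	u16 pmc;
 *	u8 pm = pci_bus_find_capability(bus, devfn, PCI_CAP_ID_PM);
 *
 *	if (pm)
 *		pci_bus_read_config_word(bus, devfn, pm + PCI_PM_PMC, &pmc);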
*/ u8 pci_bus_find_capability(struct pci_bus *bus, unsigned int devfn, int cap) { u8 hdr_type, pos; pci_bus_read_config_byte(bus, devfn, PCI_HEADER_TYPE, &hdr_type); pos = __pci_bus_find_cap_start(bus, devfn, hdr_type & PCI_HEADER_TYPE_MASK); if (pos) pos = __pci_find_next_cap(bus, devfn, pos, cap); return pos; } EXPORT_SYMBOL(pci_bus_find_capability); /** * pci_find_next_ext_capability - Find an extended capability * @dev: PCI device to query * @start: address at which to start looking (0 to start at beginning of list) * @cap: capability code * * Returns the address of the next matching extended capability structure * within the device's PCI configuration space or 0 if the device does * not support it. Some capabilities can occur several times, e.g., the * vendor-specific capability, and this provides a way to find them all. */ u16 pci_find_next_ext_capability(struct pci_dev *dev, u16 start, int cap) { if (dev->cfg_size <= PCI_CFG_SPACE_SIZE) return 0; return PCI_FIND_NEXT_EXT_CAP(pci_bus_read_config, start, cap, dev->bus, dev->devfn); } EXPORT_SYMBOL_GPL(pci_find_next_ext_capability); /** * pci_find_ext_capability - Find an extended capability * @dev: PCI device to query * @cap: capability code * * Returns the address of the requested extended capability structure * within the device's PCI configuration space or 0 if the device does * not support it. Possible values for @cap include: * * %PCI_EXT_CAP_ID_ERR Advanced Error Reporting * %PCI_EXT_CAP_ID_VC Virtual Channel * %PCI_EXT_CAP_ID_DSN Device Serial Number * %PCI_EXT_CAP_ID_PWR Power Budgeting */ u16 pci_find_ext_capability(struct pci_dev *dev, int cap) { return pci_find_next_ext_capability(dev, 0, cap); } EXPORT_SYMBOL_GPL(pci_find_ext_capability); /** * pci_get_dsn - Read and return the 8-byte Device Serial Number * @dev: PCI device to query * * Looks up the PCI_EXT_CAP_ID_DSN and reads the 8 bytes of the Device Serial * Number. * * Returns the DSN, or zero if the capability does not exist. */ u64 pci_get_dsn(struct pci_dev *dev) { u32 dword; u64 dsn; int pos; pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DSN); if (!pos) return 0; /* * The Device Serial Number is two dwords offset 4 bytes from the * capability position. The specification says that the first dword is * the lower half, and the second dword is the upper half. */ pos += 4; pci_read_config_dword(dev, pos, &dword); dsn = (u64)dword; pci_read_config_dword(dev, pos + 4, &dword); dsn |= ((u64)dword) << 32; return dsn; } EXPORT_SYMBOL_GPL(pci_get_dsn); static u8 __pci_find_next_ht_cap(struct pci_dev *dev, u8 pos, int ht_cap) { int rc; u8 cap, mask; if (ht_cap == HT_CAPTYPE_SLAVE || ht_cap == HT_CAPTYPE_HOST) mask = HT_3BIT_CAP_MASK; else mask = HT_5BIT_CAP_MASK; pos = PCI_FIND_NEXT_CAP(pci_bus_read_config, pos, PCI_CAP_ID_HT, dev->bus, dev->devfn); while (pos) { rc = pci_read_config_byte(dev, pos + 3, &cap); if (rc != PCIBIOS_SUCCESSFUL) return 0; if ((cap & mask) == ht_cap) return pos; pos = PCI_FIND_NEXT_CAP(pci_bus_read_config, pos + PCI_CAP_LIST_NEXT, PCI_CAP_ID_HT, dev->bus, dev->devfn); } return 0; } /** * pci_find_next_ht_capability - query a device's HyperTransport capabilities * @dev: PCI device to query * @pos: Position from which to continue searching * @ht_cap: HyperTransport capability code * * To be used in conjunction with pci_find_ht_capability() to search for * all capabilities matching @ht_cap. @pos should always be a value returned * from pci_find_ht_capability(). * * NB. 
To be 100% safe against broken PCI devices, the caller should take * steps to avoid an infinite loop. */ u8 pci_find_next_ht_capability(struct pci_dev *dev, u8 pos, int ht_cap) { return __pci_find_next_ht_cap(dev, pos + PCI_CAP_LIST_NEXT, ht_cap); } EXPORT_SYMBOL_GPL(pci_find_next_ht_capability); /** * pci_find_ht_capability - query a device's HyperTransport capabilities * @dev: PCI device to query * @ht_cap: HyperTransport capability code * * Tell if a device supports a given HyperTransport capability. * Returns an address within the device's PCI configuration space * or 0 in case the device does not support the request capability. * The address points to the PCI capability, of type PCI_CAP_ID_HT, * which has a HyperTransport capability matching @ht_cap. */ u8 pci_find_ht_capability(struct pci_dev *dev, int ht_cap) { u8 pos; pos = __pci_bus_find_cap_start(dev->bus, dev->devfn, dev->hdr_type); if (pos) pos = __pci_find_next_ht_cap(dev, pos, ht_cap); return pos; } EXPORT_SYMBOL_GPL(pci_find_ht_capability); /** * pci_find_vsec_capability - Find a vendor-specific extended capability * @dev: PCI device to query * @vendor: Vendor ID for which capability is defined * @cap: Vendor-specific capability ID * * If @dev has Vendor ID @vendor, search for a VSEC capability with * VSEC ID @cap. If found, return the capability offset in * config space; otherwise return 0. */ u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int cap) { u16 vsec = 0; u32 header; int ret; if (vendor != dev->vendor) return 0; while ((vsec = pci_find_next_ext_capability(dev, vsec, PCI_EXT_CAP_ID_VNDR))) { ret = pci_read_config_dword(dev, vsec + PCI_VNDR_HEADER, &header); if (ret != PCIBIOS_SUCCESSFUL) continue; if (PCI_VNDR_HEADER_ID(header) == cap) return vsec; } return 0; } EXPORT_SYMBOL_GPL(pci_find_vsec_capability); /** * pci_find_dvsec_capability - Find DVSEC for vendor * @dev: PCI device to query * @vendor: Vendor ID to match for the DVSEC * @dvsec: Designated Vendor-specific capability ID * * If DVSEC has Vendor ID @vendor and DVSEC ID @dvsec return the capability * offset in config space; otherwise return 0. */ u16 pci_find_dvsec_capability(struct pci_dev *dev, u16 vendor, u16 dvsec) { int pos; pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DVSEC); if (!pos) return 0; while (pos) { u16 v, id; pci_read_config_word(dev, pos + PCI_DVSEC_HEADER1, &v); pci_read_config_word(dev, pos + PCI_DVSEC_HEADER2, &id); if (vendor == v && dvsec == id) return pos; pos = pci_find_next_ext_capability(dev, pos, PCI_EXT_CAP_ID_DVSEC); } return 0; } EXPORT_SYMBOL_GPL(pci_find_dvsec_capability); /** * pci_find_parent_resource - return resource region of parent bus of given * region * @dev: PCI device structure contains resources to be searched * @res: child resource record for which parent is sought * * For given resource region of given device, return the resource region of * parent bus the given region is contained in. */ struct resource *pci_find_parent_resource(const struct pci_dev *dev, struct resource *res) { const struct pci_bus *bus = dev->bus; struct resource *r; pci_bus_for_each_resource(bus, r) { if (!r) continue; if (resource_contains(r, res)) { /* * If the window is prefetchable but the BAR is * not, the allocator made a mistake. */ if (r->flags & IORESOURCE_PREFETCH && !(res->flags & IORESOURCE_PREFETCH)) return NULL; /* * If we're below a transparent bridge, there may * be both a positively-decoded aperture and a * subtractively-decoded region that contain the BAR. 
* We want the positively-decoded one, so this depends * on pci_bus_for_each_resource() giving us those * first. */ return r; } } return NULL; } EXPORT_SYMBOL(pci_find_parent_resource); /** * pci_find_resource - Return matching PCI device resource * @dev: PCI device to query * @res: Resource to look for * * Goes over standard PCI resources (BARs) and checks if the given resource * is partially or fully contained in any of them. In that case the * matching resource is returned, %NULL otherwise. */ struct resource *pci_find_resource(struct pci_dev *dev, struct resource *res) { int i; for (i = 0; i < PCI_STD_NUM_BARS; i++) { struct resource *r = &dev->resource[i]; if (r->start && resource_contains(r, res)) return r; } return NULL; } EXPORT_SYMBOL(pci_find_resource); /** * pci_resource_name - Return the name of the PCI resource * @dev: PCI device to query * @i: index of the resource * * Return the standard PCI resource (BAR) name according to their index. */ const char *pci_resource_name(struct pci_dev *dev, unsigned int i) { static const char * const bar_name[] = { "BAR 0", "BAR 1", "BAR 2", "BAR 3", "BAR 4", "BAR 5", "ROM", #ifdef CONFIG_PCI_IOV "VF BAR 0", "VF BAR 1", "VF BAR 2", "VF BAR 3", "VF BAR 4", "VF BAR 5", #endif "bridge window", /* "io" included in %pR */ "bridge window", /* "mem" included in %pR */ "bridge window", /* "mem pref" included in %pR */ }; static const char * const cardbus_name[] = { "BAR 1", "unknown", "unknown", "unknown", "unknown", "unknown", #ifdef CONFIG_PCI_IOV "unknown", "unknown", "unknown", "unknown", "unknown", "unknown", #endif "CardBus bridge window 0", /* I/O */ "CardBus bridge window 1", /* I/O */ "CardBus bridge window 0", /* mem */ "CardBus bridge window 1", /* mem */ }; if (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS && i < ARRAY_SIZE(cardbus_name)) return cardbus_name[i]; if (i < ARRAY_SIZE(bar_name)) return bar_name[i]; return "unknown"; } /** * pci_wait_for_pending - wait for @mask bit(s) to clear in status word @pos * @dev: the PCI device to operate on * @pos: config space offset of status word * @mask: mask of bit(s) to care about in status word * * Return 1 when mask bit(s) in status word clear, 0 otherwise. */ int pci_wait_for_pending(struct pci_dev *dev, int pos, u16 mask) { int i; /* Wait for Transaction Pending bit clean */ for (i = 0; i < 4; i++) { u16 status; if (i) msleep((1 << (i - 1)) * 100); pci_read_config_word(dev, pos, &status); if (!(status & mask)) return 1; } return 0; } static int pci_acs_enable; /** * pci_request_acs - ask for ACS to be enabled if supported */ void pci_request_acs(void) { pci_acs_enable = 1; } static const char *disable_acs_redir_param; static const char *config_acs_param; struct pci_acs { u16 cap; u16 ctrl; u16 fw_ctrl; }; static void __pci_config_acs(struct pci_dev *dev, struct pci_acs *caps, const char *p, const u16 acs_mask, const u16 acs_flags) { u16 flags = acs_flags; u16 mask = acs_mask; char *delimit; int ret = 0; if (!p) return; while (*p) { if (!acs_mask) { /* Check for ACS flags */ delimit = strstr(p, "@"); if (delimit) { int end; u32 shift = 0; end = delimit - p - 1; mask = 0; flags = 0; while (end > -1) { if (*(p + end) == '0') { mask |= 1 << shift; shift++; end--; } else if (*(p + end) == '1') { mask |= 1 << shift; flags |= 1 << shift; shift++; end--; } else if ((*(p + end) == 'x') || (*(p + end) == 'X')) { shift++; end--; } else { pci_err(dev, "Invalid ACS flags... 
Ignoring\n"); return; } } p = delimit + 1; } else { pci_err(dev, "ACS Flags missing\n"); return; } } if (mask & ~(PCI_ACS_SV | PCI_ACS_TB | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF | PCI_ACS_EC | PCI_ACS_DT)) { pci_err(dev, "Invalid ACS flags specified\n"); return; } ret = pci_dev_str_match(dev, p, &p); if (ret < 0) { pr_info_once("PCI: Can't parse ACS command line parameter\n"); break; } else if (ret == 1) { /* Found a match */ break; } if (*p != ';' && *p != ',') { /* End of param or invalid format */ break; } p++; } if (ret != 1) return; if (!pci_dev_specific_disable_acs_redir(dev)) return; pci_dbg(dev, "ACS mask = %#06x\n", mask); pci_dbg(dev, "ACS flags = %#06x\n", flags); pci_dbg(dev, "ACS control = %#06x\n", caps->ctrl); pci_dbg(dev, "ACS fw_ctrl = %#06x\n", caps->fw_ctrl); /* * For mask bits that are 0, copy them from the firmware setting * and apply flags for all the mask bits that are 1. */ caps->ctrl = (caps->fw_ctrl & ~mask) | (flags & mask); pci_info(dev, "Configured ACS to %#06x\n", caps->ctrl); } /** * pci_std_enable_acs - enable ACS on devices using standard ACS capabilities * @dev: the PCI device * @caps: default ACS controls */ static void pci_std_enable_acs(struct pci_dev *dev, struct pci_acs *caps) { /* Source Validation */ caps->ctrl |= (caps->cap & PCI_ACS_SV); /* P2P Request Redirect */ caps->ctrl |= (caps->cap & PCI_ACS_RR); /* P2P Completion Redirect */ caps->ctrl |= (caps->cap & PCI_ACS_CR); /* Upstream Forwarding */ caps->ctrl |= (caps->cap & PCI_ACS_UF); /* Enable Translation Blocking for external devices and noats */ if (pci_ats_disabled() || dev->external_facing || dev->untrusted) caps->ctrl |= (caps->cap & PCI_ACS_TB); } /** * pci_enable_acs - enable ACS if hardware support it * @dev: the PCI device */ static void pci_enable_acs(struct pci_dev *dev) { struct pci_acs caps; bool enable_acs = false; int pos; /* If an iommu is present we start with kernel default caps */ if (pci_acs_enable) { if (pci_dev_specific_enable_acs(dev)) enable_acs = true; } pos = dev->acs_cap; if (!pos) return; pci_read_config_word(dev, pos + PCI_ACS_CAP, &caps.cap); pci_read_config_word(dev, pos + PCI_ACS_CTRL, &caps.ctrl); caps.fw_ctrl = caps.ctrl; if (enable_acs) pci_std_enable_acs(dev, &caps); /* * Always apply caps from the command line, even if there is no iommu. * Trust that the admin has a reason to change the ACS settings. */ __pci_config_acs(dev, &caps, disable_acs_redir_param, PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC, ~(PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC)); __pci_config_acs(dev, &caps, config_acs_param, 0, 0); pci_write_config_word(dev, pos + PCI_ACS_CTRL, caps.ctrl); } /** * pci_restore_bars - restore a device's BAR values (e.g. after wake-up) * @dev: PCI device to have its BARs restored * * Restore the BAR values for a given device, so as to make it * accessible by its driver. 
*/ static void pci_restore_bars(struct pci_dev *dev) { int i; for (i = 0; i < PCI_BRIDGE_RESOURCES; i++) pci_update_resource(dev, i); } static inline bool platform_pci_power_manageable(struct pci_dev *dev) { if (pci_use_mid_pm()) return true; return acpi_pci_power_manageable(dev); } static inline int platform_pci_set_power_state(struct pci_dev *dev, pci_power_t t) { if (pci_use_mid_pm()) return mid_pci_set_power_state(dev, t); return acpi_pci_set_power_state(dev, t); } static inline pci_power_t platform_pci_get_power_state(struct pci_dev *dev) { if (pci_use_mid_pm()) return mid_pci_get_power_state(dev); return acpi_pci_get_power_state(dev); } static inline void platform_pci_refresh_power_state(struct pci_dev *dev) { if (!pci_use_mid_pm()) acpi_pci_refresh_power_state(dev); } static inline pci_power_t platform_pci_choose_state(struct pci_dev *dev) { if (pci_use_mid_pm()) return PCI_POWER_ERROR; return acpi_pci_choose_state(dev); } static inline int platform_pci_set_wakeup(struct pci_dev *dev, bool enable) { if (pci_use_mid_pm()) return PCI_POWER_ERROR; return acpi_pci_wakeup(dev, enable); } static inline bool platform_pci_need_resume(struct pci_dev *dev) { if (pci_use_mid_pm()) return false; return acpi_pci_need_resume(dev); } static inline bool platform_pci_bridge_d3(struct pci_dev *dev) { if (pci_use_mid_pm()) return false; return acpi_pci_bridge_d3(dev); } /** * pci_update_current_state - Read power state of given device and cache it * @dev: PCI device to handle. * @state: State to cache in case the device doesn't have the PM capability * * The power state is read from the PMCSR register, which however is * inaccessible in D3cold. The platform firmware is therefore queried first * to detect accessibility of the register. In case the platform firmware * reports an incorrect state or the device isn't power manageable by the * platform at all, we try to detect D3cold by testing accessibility of the * vendor ID in config space. */ void pci_update_current_state(struct pci_dev *dev, pci_power_t state) { if (platform_pci_get_power_state(dev) == PCI_D3cold) { dev->current_state = PCI_D3cold; } else if (dev->pm_cap) { u16 pmcsr; pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); if (PCI_POSSIBLE_ERROR(pmcsr)) { dev->current_state = PCI_D3cold; return; } dev->current_state = pmcsr & PCI_PM_CTRL_STATE_MASK; } else { dev->current_state = state; } } /** * pci_refresh_power_state - Refresh the given device's power state data * @dev: Target PCI device. * * Ask the platform to refresh the devices power state information and invoke * pci_update_current_state() to update its current PCI power state. */ void pci_refresh_power_state(struct pci_dev *dev) { platform_pci_refresh_power_state(dev); pci_update_current_state(dev, dev->current_state); } /** * pci_platform_power_transition - Use platform to change device power state * @dev: PCI device to handle. * @state: State to put the device into. */ int pci_platform_power_transition(struct pci_dev *dev, pci_power_t state) { int error; error = platform_pci_set_power_state(dev, state); if (!error) pci_update_current_state(dev, state); else if (!dev->pm_cap) /* Fall back to PCI_D0 */ dev->current_state = PCI_D0; return error; } EXPORT_SYMBOL_GPL(pci_platform_power_transition); static int pci_resume_one(struct pci_dev *pci_dev, void *ign) { pm_request_resume(&pci_dev->dev); return 0; } /** * pci_resume_bus - Walk given bus and runtime resume devices on it * @bus: Top bus of the subtree to walk. 
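 *
 * A minimal usage sketch (hypothetical caller): a port driver that has just
 * brought its link back up can request a runtime resume of every device
 * below the bridge with
 *
 *	pci_resume_bus(bridge->subordinate);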
*/ void pci_resume_bus(struct pci_bus *bus) { if (bus) pci_walk_bus(bus, pci_resume_one, NULL); } static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout) { int delay = 1; bool retrain = false; struct pci_dev *root, *bridge; root = pcie_find_root_port(dev); if (pci_is_pcie(dev)) { bridge = pci_upstream_bridge(dev); if (bridge) retrain = true; } /* * The caller has already waited long enough after a reset that the * device should respond to config requests, but it may respond * with Request Retry Status (RRS) if it needs more time to * initialize. * * If the device is below a Root Port with Configuration RRS * Software Visibility enabled, reading the Vendor ID returns a * special data value if the device responded with RRS. Read the * Vendor ID until we get non-RRS status. * * If there's no Root Port or Configuration RRS Software Visibility * is not enabled, the device may still respond with RRS, but * hardware may retry the config request. If no retries receive * Successful Completion, hardware generally synthesizes ~0 * (PCI_ERROR_RESPONSE) data to complete the read. Reading Vendor * ID for VFs and non-existent devices also returns ~0, so read the * Command register until it returns something other than ~0. */ for (;;) { u32 id; if (pci_dev_is_disconnected(dev)) { pci_dbg(dev, "disconnected; not waiting\n"); return -ENOTTY; } if (root && root->config_rrs_sv) { pci_read_config_dword(dev, PCI_VENDOR_ID, &id); if (!pci_bus_rrs_vendor_id(id)) break; } else { pci_read_config_dword(dev, PCI_COMMAND, &id); if (!PCI_POSSIBLE_ERROR(id)) break; } if (delay > timeout) { pci_warn(dev, "not ready %dms after %s; giving up\n", delay - 1, reset_type); return -ENOTTY; } if (delay > PCI_RESET_WAIT) { if (retrain) { retrain = false; if (pcie_failed_link_retrain(bridge) == 0) { delay = 1; continue; } } pci_info(dev, "not ready %dms after %s; waiting\n", delay - 1, reset_type); } msleep(delay); delay *= 2; } if (delay > PCI_RESET_WAIT) pci_info(dev, "ready %dms after %s\n", delay - 1, reset_type); else pci_dbg(dev, "ready %dms after %s\n", delay - 1, reset_type); return 0; } /** * pci_power_up - Put the given device into D0 * @dev: PCI device to power up * * On success, return 0 or 1, depending on whether or not it is necessary to * restore the device's BARs subsequently (1 is returned in that case). * * On failure, return a negative error code. Always return failure if @dev * lacks a Power Management Capability, even if the platform was able to * put the device in D0 via non-PCI means. */ int pci_power_up(struct pci_dev *dev) { bool need_restore; pci_power_t state; u16 pmcsr; platform_pci_set_power_state(dev, PCI_D0); if (!dev->pm_cap) { state = platform_pci_get_power_state(dev); if (state == PCI_UNKNOWN) dev->current_state = PCI_D0; else dev->current_state = state; return -EIO; } if (pci_dev_is_disconnected(dev)) { dev->current_state = PCI_D3cold; return -EIO; } pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); if (PCI_POSSIBLE_ERROR(pmcsr)) { pci_err(dev, "Unable to change power state from %s to D0, device inaccessible\n", pci_power_name(dev->current_state)); dev->current_state = PCI_D3cold; return -EIO; } state = pmcsr & PCI_PM_CTRL_STATE_MASK; need_restore = (state == PCI_D3hot || dev->current_state >= PCI_D3hot) && !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET); if (state == PCI_D0) goto end; /* * Force the entire word to 0. This doesn't affect PME_Status, disables * PME_En, and sets PowerState to 0. 
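 * (PowerState is the low two bits of PCI_PM_CTRL, so writing zero selects
 * D0; PME_Status is write-one-to-clear, so a zero write leaves it intact.)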
*/ pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, 0); /* Mandatory transition delays; see PCI PM 1.2. */ if (state == PCI_D3hot) pci_dev_d3_sleep(dev); else if (state == PCI_D2) udelay(PCI_PM_D2_DELAY); end: dev->current_state = PCI_D0; if (need_restore) return 1; return 0; } /** * pci_set_full_power_state - Put a PCI device into D0 and update its state * @dev: PCI device to power up * @locked: whether pci_bus_sem is held * * Call pci_power_up() to put @dev into D0, read from its PCI_PM_CTRL register * to confirm the state change, restore its BARs if they might be lost and * reconfigure ASPM in accordance with the new power state. * * If pci_restore_state() is going to be called right after a power state change * to D0, it is more efficient to use pci_power_up() directly instead of this * function. */ static int pci_set_full_power_state(struct pci_dev *dev, bool locked) { u16 pmcsr; int ret; ret = pci_power_up(dev); if (ret < 0) { if (dev->current_state == PCI_D0) return 0; return ret; } pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); dev->current_state = pmcsr & PCI_PM_CTRL_STATE_MASK; if (dev->current_state != PCI_D0) { pci_info_ratelimited(dev, "Refused to change power state from %s to D0\n", pci_power_name(dev->current_state)); } else if (ret > 0) { /* * According to section 5.4.1 of the "PCI BUS POWER MANAGEMENT * INTERFACE SPECIFICATION, REV. 1.2", a device transitioning * from D3hot to D0 _may_ perform an internal reset, thereby * going to "D0 Uninitialized" rather than "D0 Initialized". * For example, at least some versions of the 3c905B and the * 3c556B exhibit this behaviour. * * At least some laptop BIOSen (e.g. the Thinkpad T21) leave * devices in a D3hot state at boot. Consequently, we need to * restore at least the BARs so that the device will be * accessible to its driver. */ pci_restore_bars(dev); } if (dev->bus->self) pcie_aspm_pm_state_change(dev->bus->self, locked); return 0; } /** * __pci_dev_set_current_state - Set current state of a PCI device * @dev: Device to handle * @data: pointer to state to be set */ static int __pci_dev_set_current_state(struct pci_dev *dev, void *data) { pci_power_t state = *(pci_power_t *)data; dev->current_state = state; return 0; } /** * pci_bus_set_current_state - Walk given bus and set current state of devices * @bus: Top bus of the subtree to walk. * @state: state to be set */ void pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state) { if (bus) pci_walk_bus(bus, __pci_dev_set_current_state, &state); } static void __pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state, bool locked) { if (!bus) return; if (locked) pci_walk_bus_locked(bus, __pci_dev_set_current_state, &state); else pci_walk_bus(bus, __pci_dev_set_current_state, &state); } /** * pci_set_low_power_state - Put a PCI device into a low-power state. * @dev: PCI device to handle. * @state: PCI power state (D1, D2, D3hot) to put the device into. * @locked: whether pci_bus_sem is held * * Use the device's PCI_PM_CTRL register to put it into a low-power state. * * RETURN VALUE: * -EINVAL if the requested state is invalid. * -EIO if device does not support PCI PM or its PM capabilities register has a * wrong version, or device doesn't support the requested state. * 0 if device already is in the requested state. * 0 if device's power state has been successfully changed. 
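 *
 * This is an internal helper: __pci_set_power_state() calls it for the D1,
 * D2 and D3hot cases (and on the way to D3cold), so drivers should use
 * pci_set_power_state() rather than reaching for this directly.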
*/ static int pci_set_low_power_state(struct pci_dev *dev, pci_power_t state, bool locked) { u16 pmcsr; if (!dev->pm_cap) return -EIO; /* * Validate transition: We can enter D0 from any state, but if * we're already in a low-power state, we can only go deeper. E.g., * we can go from D1 to D3, but we can't go directly from D3 to D1; * we'd have to go from D3 to D0, then to D1. */ if (dev->current_state <= PCI_D3cold && dev->current_state > state) { pci_dbg(dev, "Invalid power transition (from %s to %s)\n", pci_power_name(dev->current_state), pci_power_name(state)); return -EINVAL; } /* Check if this device supports the desired state */ if ((state == PCI_D1 && !dev->d1_support) || (state == PCI_D2 && !dev->d2_support)) return -EIO; pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); if (PCI_POSSIBLE_ERROR(pmcsr)) { pci_err(dev, "Unable to change power state from %s to %s, device inaccessible\n", pci_power_name(dev->current_state), pci_power_name(state)); dev->current_state = PCI_D3cold; return -EIO; } pmcsr &= ~PCI_PM_CTRL_STATE_MASK; pmcsr |= state; /* Enter specified state */ pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr); /* Mandatory power management transition delays; see PCI PM 1.2. */ if (state == PCI_D3hot) pci_dev_d3_sleep(dev); else if (state == PCI_D2) udelay(PCI_PM_D2_DELAY); pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); dev->current_state = pmcsr & PCI_PM_CTRL_STATE_MASK; if (dev->current_state != state) pci_info_ratelimited(dev, "Refused to change power state from %s to %s\n", pci_power_name(dev->current_state), pci_power_name(state)); if (dev->bus->self) pcie_aspm_pm_state_change(dev->bus->self, locked); return 0; } static int __pci_set_power_state(struct pci_dev *dev, pci_power_t state, bool locked) { int error; /* Bound the state we're entering */ if (state > PCI_D3cold) state = PCI_D3cold; else if (state < PCI_D0) state = PCI_D0; else if ((state == PCI_D1 || state == PCI_D2) && pci_no_d1d2(dev)) /* * If the device or the parent bridge do not support PCI * PM, ignore the request if we're doing anything other * than putting it into D0 (which would only happen on * boot). */ return 0; /* Check if we're already there */ if (dev->current_state == state) return 0; if (state == PCI_D0) return pci_set_full_power_state(dev, locked); /* * This device is quirked not to be put into D3, so don't put it in * D3 */ if (state >= PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3)) return 0; if (state == PCI_D3cold) { /* * To put the device in D3cold, put it into D3hot in the native * way, then put it into D3cold using platform ops. */ error = pci_set_low_power_state(dev, PCI_D3hot, locked); if (pci_platform_power_transition(dev, PCI_D3cold)) return error; /* Powering off a bridge may power off the whole hierarchy */ if (dev->current_state == PCI_D3cold) __pci_bus_set_current_state(dev->subordinate, PCI_D3cold, locked); } else { error = pci_set_low_power_state(dev, state, locked); if (pci_platform_power_transition(dev, state)) return error; } return 0; } /** * pci_set_power_state - Set the power state of a PCI device * @dev: PCI device to handle. * @state: PCI power state (D0, D1, D2, D3hot) to put the device into. * * Transition a device to a new power state, using the platform firmware and/or * the device's PCI PM registers. * * RETURN VALUE: * -EINVAL if the requested state is invalid. * -EIO if device does not support PCI PM or its PM capabilities register has a * wrong version, or device doesn't support the requested state. 
* 0 if the transition is to D1 or D2 but D1 and D2 are not supported. * 0 if device already is in the requested state. * 0 if the transition is to D3 but D3 is not supported. * 0 if device's power state has been successfully changed. */ int pci_set_power_state(struct pci_dev *dev, pci_power_t state) { return __pci_set_power_state(dev, state, false); } EXPORT_SYMBOL(pci_set_power_state); int pci_set_power_state_locked(struct pci_dev *dev, pci_power_t state) { lockdep_assert_held(&pci_bus_sem); return __pci_set_power_state(dev, state, true); } EXPORT_SYMBOL(pci_set_power_state_locked); #define PCI_EXP_SAVE_REGS 7 static struct pci_cap_saved_state *_pci_find_saved_cap(struct pci_dev *pci_dev, u16 cap, bool extended) { struct pci_cap_saved_state *tmp; hlist_for_each_entry(tmp, &pci_dev->saved_cap_space, next) { if (tmp->cap.cap_extended == extended && tmp->cap.cap_nr == cap) return tmp; } return NULL; } struct pci_cap_saved_state *pci_find_saved_cap(struct pci_dev *dev, char cap) { return _pci_find_saved_cap(dev, cap, false); } struct pci_cap_saved_state *pci_find_saved_ext_cap(struct pci_dev *dev, u16 cap) { return _pci_find_saved_cap(dev, cap, true); } static int pci_save_pcie_state(struct pci_dev *dev) { int i = 0; struct pci_cap_saved_state *save_state; u16 *cap; if (!pci_is_pcie(dev)) return 0; save_state = pci_find_saved_cap(dev, PCI_CAP_ID_EXP); if (!save_state) { pci_err(dev, "buffer not found in %s\n", __func__); return -ENOMEM; } cap = (u16 *)&save_state->cap.data[0]; pcie_capability_read_word(dev, PCI_EXP_DEVCTL, &cap[i++]); pcie_capability_read_word(dev, PCI_EXP_LNKCTL, &cap[i++]); pcie_capability_read_word(dev, PCI_EXP_SLTCTL, &cap[i++]); pcie_capability_read_word(dev, PCI_EXP_RTCTL, &cap[i++]); pcie_capability_read_word(dev, PCI_EXP_DEVCTL2, &cap[i++]); pcie_capability_read_word(dev, PCI_EXP_LNKCTL2, &cap[i++]); pcie_capability_read_word(dev, PCI_EXP_SLTCTL2, &cap[i++]); pci_save_aspm_l1ss_state(dev); pci_save_ltr_state(dev); return 0; } static void pci_restore_pcie_state(struct pci_dev *dev) { int i = 0; struct pci_cap_saved_state *save_state; u16 *cap; /* * Restore max latencies (in the LTR capability) before enabling * LTR itself in PCI_EXP_DEVCTL2. */ pci_restore_ltr_state(dev); pci_restore_aspm_l1ss_state(dev); save_state = pci_find_saved_cap(dev, PCI_CAP_ID_EXP); if (!save_state) return; /* * Downstream ports reset the LTR enable bit when link goes down. * Check and re-configure the bit here before restoring device. * PCIe r5.0, sec 7.5.3.16. 
*/ pci_bridge_reconfigure_ltr(dev); cap = (u16 *)&save_state->cap.data[0]; pcie_capability_write_word(dev, PCI_EXP_DEVCTL, cap[i++]); pcie_capability_write_word(dev, PCI_EXP_LNKCTL, cap[i++]); pcie_capability_write_word(dev, PCI_EXP_SLTCTL, cap[i++]); pcie_capability_write_word(dev, PCI_EXP_RTCTL, cap[i++]); pcie_capability_write_word(dev, PCI_EXP_DEVCTL2, cap[i++]); pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, cap[i++]); pcie_capability_write_word(dev, PCI_EXP_SLTCTL2, cap[i++]); } static int pci_save_pcix_state(struct pci_dev *dev) { int pos; struct pci_cap_saved_state *save_state; pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); if (!pos) return 0; save_state = pci_find_saved_cap(dev, PCI_CAP_ID_PCIX); if (!save_state) { pci_err(dev, "buffer not found in %s\n", __func__); return -ENOMEM; } pci_read_config_word(dev, pos + PCI_X_CMD, (u16 *)save_state->cap.data); return 0; } static void pci_restore_pcix_state(struct pci_dev *dev) { int i = 0, pos; struct pci_cap_saved_state *save_state; u16 *cap; save_state = pci_find_saved_cap(dev, PCI_CAP_ID_PCIX); pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); if (!save_state || !pos) return; cap = (u16 *)&save_state->cap.data[0]; pci_write_config_word(dev, pos + PCI_X_CMD, cap[i++]); } /** * pci_save_state - save the PCI configuration space of a device before * suspending * @dev: PCI device that we're dealing with */ int pci_save_state(struct pci_dev *dev) { int i; /* XXX: 100% dword access ok here? */ for (i = 0; i < 16; i++) { pci_read_config_dword(dev, i * 4, &dev->saved_config_space[i]); pci_dbg(dev, "save config %#04x: %#010x\n", i * 4, dev->saved_config_space[i]); } dev->state_saved = true; i = pci_save_pcie_state(dev); if (i != 0) return i; i = pci_save_pcix_state(dev); if (i != 0) return i; pci_save_dpc_state(dev); pci_save_aer_state(dev); pci_save_ptm_state(dev); pci_save_tph_state(dev); return pci_save_vc_state(dev); } EXPORT_SYMBOL(pci_save_state); static void pci_restore_config_dword(struct pci_dev *pdev, int offset, u32 saved_val, int retry, bool force) { u32 val; pci_read_config_dword(pdev, offset, &val); if (!force && val == saved_val) return; for (;;) { pci_dbg(pdev, "restore config %#04x: %#010x -> %#010x\n", offset, val, saved_val); pci_write_config_dword(pdev, offset, saved_val); if (retry-- <= 0) return; pci_read_config_dword(pdev, offset, &val); if (val == saved_val) return; mdelay(1); } } static void pci_restore_config_space_range(struct pci_dev *pdev, int start, int end, int retry, bool force) { int index; for (index = end; index >= start; index--) pci_restore_config_dword(pdev, 4 * index, pdev->saved_config_space[index], retry, force); } static void pci_restore_config_space(struct pci_dev *pdev) { if (pdev->hdr_type == PCI_HEADER_TYPE_NORMAL) { pci_restore_config_space_range(pdev, 10, 15, 0, false); /* Restore BARs before the command register. */ pci_restore_config_space_range(pdev, 4, 9, 10, false); pci_restore_config_space_range(pdev, 0, 3, 0, false); } else if (pdev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { pci_restore_config_space_range(pdev, 12, 15, 0, false); /* * Force rewriting of prefetch registers to avoid S3 resume * issues on Intel PCI bridges that occur when these * registers are not explicitly written. 
*/ pci_restore_config_space_range(pdev, 9, 11, 0, true); pci_restore_config_space_range(pdev, 0, 8, 0, false); } else { pci_restore_config_space_range(pdev, 0, 15, 0, false); } } static void pci_restore_rebar_state(struct pci_dev *pdev) { unsigned int pos, nbars, i; u32 ctrl; pos = pdev->rebar_cap; if (!pos) return; pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); nbars = FIELD_GET(PCI_REBAR_CTRL_NBAR_MASK, ctrl); for (i = 0; i < nbars; i++, pos += 8) { struct resource *res; int bar_idx, size; pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); bar_idx = ctrl & PCI_REBAR_CTRL_BAR_IDX; res = pci_resource_n(pdev, bar_idx); size = pci_rebar_bytes_to_size(resource_size(res)); ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE; ctrl |= FIELD_PREP(PCI_REBAR_CTRL_BAR_SIZE, size); pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl); } } /** * pci_restore_state - Restore the saved state of a PCI device * @dev: PCI device that we're dealing with */ void pci_restore_state(struct pci_dev *dev) { if (!dev->state_saved) return; pci_restore_pcie_state(dev); pci_restore_pasid_state(dev); pci_restore_pri_state(dev); pci_restore_ats_state(dev); pci_restore_vc_state(dev); pci_restore_rebar_state(dev); pci_restore_dpc_state(dev); pci_restore_ptm_state(dev); pci_restore_tph_state(dev); pci_aer_clear_status(dev); pci_restore_aer_state(dev); pci_restore_config_space(dev); pci_restore_pcix_state(dev); pci_restore_msi_state(dev); /* Restore ACS and IOV configuration state */ pci_enable_acs(dev); pci_restore_iov_state(dev); dev->state_saved = false; } EXPORT_SYMBOL(pci_restore_state); struct pci_saved_state { u32 config_space[16]; struct pci_cap_saved_data cap[]; }; /** * pci_store_saved_state - Allocate and return an opaque struct containing * the device saved state. * @dev: PCI device that we're dealing with * * Return NULL if no state or error. */ struct pci_saved_state *pci_store_saved_state(struct pci_dev *dev) { struct pci_saved_state *state; struct pci_cap_saved_state *tmp; struct pci_cap_saved_data *cap; size_t size; if (!dev->state_saved) return NULL; size = sizeof(*state) + sizeof(struct pci_cap_saved_data); hlist_for_each_entry(tmp, &dev->saved_cap_space, next) size += sizeof(struct pci_cap_saved_data) + tmp->cap.size; state = kzalloc(size, GFP_KERNEL); if (!state) return NULL; memcpy(state->config_space, dev->saved_config_space, sizeof(state->config_space)); cap = state->cap; hlist_for_each_entry(tmp, &dev->saved_cap_space, next) { size_t len = sizeof(struct pci_cap_saved_data) + tmp->cap.size; memcpy(cap, &tmp->cap, len); cap = (struct pci_cap_saved_data *)((u8 *)cap + len); } /* Empty cap_save terminates list */ return state; } EXPORT_SYMBOL_GPL(pci_store_saved_state); /** * pci_load_saved_state - Reload the provided save state into struct pci_dev. 
* @dev: PCI device that we're dealing with * @state: Saved state returned from pci_store_saved_state() */ int pci_load_saved_state(struct pci_dev *dev, struct pci_saved_state *state) { struct pci_cap_saved_data *cap; dev->state_saved = false; if (!state) return 0; memcpy(dev->saved_config_space, state->config_space, sizeof(state->config_space)); cap = state->cap; while (cap->size) { struct pci_cap_saved_state *tmp; tmp = _pci_find_saved_cap(dev, cap->cap_nr, cap->cap_extended); if (!tmp || tmp->cap.size != cap->size) return -EINVAL; memcpy(tmp->cap.data, cap->data, tmp->cap.size); cap = (struct pci_cap_saved_data *)((u8 *)cap + sizeof(struct pci_cap_saved_data) + cap->size); } dev->state_saved = true; return 0; } EXPORT_SYMBOL_GPL(pci_load_saved_state); /** * pci_load_and_free_saved_state - Reload the save state pointed to by state, * and free the memory allocated for it. * @dev: PCI device that we're dealing with * @state: Pointer to saved state returned from pci_store_saved_state() */ int pci_load_and_free_saved_state(struct pci_dev *dev, struct pci_saved_state **state) { int ret = pci_load_saved_state(dev, *state); kfree(*state); *state = NULL; return ret; } EXPORT_SYMBOL_GPL(pci_load_and_free_saved_state); int __weak pcibios_enable_device(struct pci_dev *dev, int bars) { return pci_enable_resources(dev, bars); } static int pci_host_bridge_enable_device(struct pci_dev *dev) { struct pci_host_bridge *host_bridge = pci_find_host_bridge(dev->bus); int err; if (host_bridge && host_bridge->enable_device) { err = host_bridge->enable_device(host_bridge, dev); if (err) return err; } return 0; } static void pci_host_bridge_disable_device(struct pci_dev *dev) { struct pci_host_bridge *host_bridge = pci_find_host_bridge(dev->bus); if (host_bridge && host_bridge->disable_device) host_bridge->disable_device(host_bridge, dev); } static int do_pci_enable_device(struct pci_dev *dev, int bars) { int err; struct pci_dev *bridge; u16 cmd; u8 pin; err = pci_set_power_state(dev, PCI_D0); if (err < 0 && err != -EIO) return err; bridge = pci_upstream_bridge(dev); if (bridge) pcie_aspm_powersave_config_link(bridge); err = pci_host_bridge_enable_device(dev); if (err) return err; err = pcibios_enable_device(dev, bars); if (err < 0) goto err_enable; pci_fixup_device(pci_fixup_enable, dev); if (dev->msi_enabled || dev->msix_enabled) return 0; pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); if (pin) { pci_read_config_word(dev, PCI_COMMAND, &cmd); if (cmd & PCI_COMMAND_INTX_DISABLE) pci_write_config_word(dev, PCI_COMMAND, cmd & ~PCI_COMMAND_INTX_DISABLE); } return 0; err_enable: pci_host_bridge_disable_device(dev); return err; } /** * pci_reenable_device - Resume abandoned device * @dev: PCI device to be resumed * * NOTE: This function is a backend of pci_default_resume() and is not supposed * to be called by normal code, write proper resume handler and use it instead. 
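 *
 * For reference, the usual way for code outside the PCI core to capture and
 * replay device state is the saved-state helpers defined above. A minimal,
 * illustrative sketch (not part of this file; error handling omitted):
 *
 *	struct pci_saved_state *snapshot;
 *
 *	pci_save_state(pdev);
 *	snapshot = pci_store_saved_state(pdev);
 *	...reset or power-cycle the device...
 *	pci_load_and_free_saved_state(pdev, &snapshot);
 *	pci_restore_state(pdev);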
*/ int pci_reenable_device(struct pci_dev *dev) { if (pci_is_enabled(dev)) return do_pci_enable_device(dev, (1 << PCI_NUM_RESOURCES) - 1); return 0; } EXPORT_SYMBOL(pci_reenable_device); static void pci_enable_bridge(struct pci_dev *dev) { struct pci_dev *bridge; int retval; bridge = pci_upstream_bridge(dev); if (bridge) pci_enable_bridge(bridge); if (pci_is_enabled(dev)) { if (!dev->is_busmaster) pci_set_master(dev); return; } retval = pci_enable_device(dev); if (retval) pci_err(dev, "Error enabling bridge (%d), continuing\n", retval); pci_set_master(dev); } static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags) { struct pci_dev *bridge; int err; int i, bars = 0; /* * Power state could be unknown at this point, either due to a fresh * boot or a device removal call. So get the current power state * so that things like MSI message writing will behave as expected * (e.g. if the device really is in D0 at enable time). */ pci_update_current_state(dev, dev->current_state); if (atomic_inc_return(&dev->enable_cnt) > 1) return 0; /* already enabled */ bridge = pci_upstream_bridge(dev); if (bridge) pci_enable_bridge(bridge); /* only skip sriov related */ for (i = 0; i <= PCI_ROM_RESOURCE; i++) if (dev->resource[i].flags & flags) bars |= (1 << i); for (i = PCI_BRIDGE_RESOURCES; i < DEVICE_COUNT_RESOURCE; i++) if (dev->resource[i].flags & flags) bars |= (1 << i); err = do_pci_enable_device(dev, bars); if (err < 0) atomic_dec(&dev->enable_cnt); return err; } /** * pci_enable_device_mem - Initialize a device for use with Memory space * @dev: PCI device to be initialized * * Initialize device before it's used by a driver. Ask low-level code * to enable Memory resources. Wake up the device if it was suspended. * Beware, this function can fail. */ int pci_enable_device_mem(struct pci_dev *dev) { return pci_enable_device_flags(dev, IORESOURCE_MEM); } EXPORT_SYMBOL(pci_enable_device_mem); /** * pci_enable_device - Initialize device before it's used by a driver. * @dev: PCI device to be initialized * * Initialize device before it's used by a driver. Ask low-level code * to enable I/O and memory. Wake up the device if it was suspended. * Beware, this function can fail. * * Note we don't actually enable the device many times if we call * this function repeatedly (we just increment the count). */ int pci_enable_device(struct pci_dev *dev) { return pci_enable_device_flags(dev, IORESOURCE_MEM | IORESOURCE_IO); } EXPORT_SYMBOL(pci_enable_device); /* * pcibios_device_add - provide arch specific hooks when adding device dev * @dev: the PCI device being added * * Permits the platform to provide architecture specific functionality when * devices are added. This is the default implementation. Architecture * implementations can override this. */ int __weak pcibios_device_add(struct pci_dev *dev) { return 0; } /** * pcibios_release_device - provide arch specific hooks when releasing * device dev * @dev: the PCI device being released * * Permits the platform to provide architecture specific functionality when * devices are released. This is the default implementation. Architecture * implementations can override this. */ void __weak pcibios_release_device(struct pci_dev *dev) {} /** * pcibios_disable_device - disable arch specific PCI resources for device dev * @dev: the PCI device to disable * * Disables architecture specific PCI resources for the device. This * is the default implementation. Architecture implementations can * override this. 
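 *
 * As a point of reference, the pci_enable_device*()/pci_disable_device()
 * pairing documented above is what a typical driver relies on in its probe
 * and remove paths. A hedged, illustrative sketch (the foo_* names are
 * placeholders, not code from this file):
 *
 *	static int foo_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 *	{
 *		int err = pci_enable_device_mem(pdev);
 *
 *		if (err)
 *			return err;
 *		pci_set_master(pdev);
 *		return 0;
 *	}
 *
 *	static void foo_remove(struct pci_dev *pdev)
 *	{
 *		pci_disable_device(pdev);
 *	}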
*/ void __weak pcibios_disable_device(struct pci_dev *dev) {} static void do_pci_disable_device(struct pci_dev *dev) { u16 pci_command; pci_read_config_word(dev, PCI_COMMAND, &pci_command); if (pci_command & PCI_COMMAND_MASTER) { pci_command &= ~PCI_COMMAND_MASTER; pci_write_config_word(dev, PCI_COMMAND, pci_command); } pcibios_disable_device(dev); } /** * pci_disable_enabled_device - Disable device without updating enable_cnt * @dev: PCI device to disable * * NOTE: This function is a backend of PCI power management routines and is * not supposed to be called by drivers. */ void pci_disable_enabled_device(struct pci_dev *dev) { if (pci_is_enabled(dev)) do_pci_disable_device(dev); } /** * pci_disable_device - Disable PCI device after use * @dev: PCI device to be disabled * * Signal to the system that the PCI device is not in use by the system * anymore. This only involves disabling PCI bus-mastering, if active. * * Note we don't actually disable the device until all callers of * pci_enable_device() have called pci_disable_device(). */ void pci_disable_device(struct pci_dev *dev) { dev_WARN_ONCE(&dev->dev, atomic_read(&dev->enable_cnt) <= 0, "disabling already-disabled device"); if (atomic_dec_return(&dev->enable_cnt) != 0) return; pci_host_bridge_disable_device(dev); do_pci_disable_device(dev); dev->is_busmaster = 0; } EXPORT_SYMBOL(pci_disable_device); /** * pcibios_set_pcie_reset_state - set reset state for device dev * @dev: the PCIe device to reset * @state: Reset state to enter into * * Set the PCIe reset state for the device. This is the default * implementation. Architecture implementations can override this. */ int __weak pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) { return -EINVAL; } /** * pci_set_pcie_reset_state - set reset state for device dev * @dev: the PCIe device to reset * @state: Reset state to enter into * * Sets the PCI reset state for the device. */ int pci_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) { return pcibios_set_pcie_reset_state(dev, state); } EXPORT_SYMBOL_GPL(pci_set_pcie_reset_state); #ifdef CONFIG_PCIEAER void pcie_clear_device_status(struct pci_dev *dev) { u16 sta; pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta); pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta); } #endif /** * pcie_clear_root_pme_status - Clear root port PME interrupt status. * @dev: PCIe root port or event collector. */ void pcie_clear_root_pme_status(struct pci_dev *dev) { pcie_capability_set_dword(dev, PCI_EXP_RTSTA, PCI_EXP_RTSTA_PME); } /** * pci_check_pme_status - Check if given device has generated PME. * @dev: Device to check. * * Check the PME status of the device and if set, clear it and clear PME enable * (if set). Return 'true' if PME status and PME enable were both set or * 'false' otherwise. */ bool pci_check_pme_status(struct pci_dev *dev) { int pmcsr_pos; u16 pmcsr; bool ret = false; if (!dev->pm_cap) return false; pmcsr_pos = dev->pm_cap + PCI_PM_CTRL; pci_read_config_word(dev, pmcsr_pos, &pmcsr); if (!(pmcsr & PCI_PM_CTRL_PME_STATUS)) return false; /* Clear PME status. */ pmcsr |= PCI_PM_CTRL_PME_STATUS; if (pmcsr & PCI_PM_CTRL_PME_ENABLE) { /* Disable PME to avoid interrupt flood. */ pmcsr &= ~PCI_PM_CTRL_PME_ENABLE; ret = true; } pci_write_config_word(dev, pmcsr_pos, pmcsr); return ret; } /** * pci_pme_wakeup - Wake up a PCI device if its PME Status bit is set. * @dev: Device to handle. * @pme_poll_reset: Whether or not to reset the device's pme_poll flag.
* * Check if @dev has generated PME and queue a resume request for it in that * case. */ static int pci_pme_wakeup(struct pci_dev *dev, void *pme_poll_reset) { if (pme_poll_reset && dev->pme_poll) dev->pme_poll = false; if (pci_check_pme_status(dev)) { pci_wakeup_event(dev); pm_request_resume(&dev->dev); } return 0; } /** * pci_pme_wakeup_bus - Walk given bus and wake up devices on it, if necessary. * @bus: Top bus of the subtree to walk. */ void pci_pme_wakeup_bus(struct pci_bus *bus) { if (bus) pci_walk_bus(bus, pci_pme_wakeup, (void *)true); } /** * pci_pme_capable - check the capability of PCI device to generate PME# * @dev: PCI device to handle. * @state: PCI state from which device will issue PME#. */ bool pci_pme_capable(struct pci_dev *dev, pci_power_t state) { if (!dev->pm_cap) return false; return !!(dev->pme_support & (1 << state)); } EXPORT_SYMBOL(pci_pme_capable); static void pci_pme_list_scan(struct work_struct *work) { struct pci_pme_device *pme_dev, *n; mutex_lock(&pci_pme_list_mutex); list_for_each_entry_safe(pme_dev, n, &pci_pme_list, list) { struct pci_dev *pdev = pme_dev->dev; if (pdev->pme_poll) { struct pci_dev *bridge = pdev->bus->self; struct device *dev = &pdev->dev; struct device *bdev = bridge ? &bridge->dev : NULL; int bref = 0; /* * If we have a bridge, it should be in an active/D0 * state or the configuration space of subordinate * devices may not be accessible or stable over the * course of the call. */ if (bdev) { bref = pm_runtime_get_if_active(bdev); if (!bref) continue; if (bridge->current_state != PCI_D0) goto put_bridge; } /* * The device itself should be suspended but config * space must be accessible, therefore it cannot be in * D3cold. */ if (pm_runtime_suspended(dev) && pdev->current_state != PCI_D3cold) pci_pme_wakeup(pdev, NULL); put_bridge: if (bref > 0) pm_runtime_put(bdev); } else { list_del(&pme_dev->list); kfree(pme_dev); } } if (!list_empty(&pci_pme_list)) queue_delayed_work(system_freezable_wq, &pci_pme_work, msecs_to_jiffies(PME_TIMEOUT)); mutex_unlock(&pci_pme_list_mutex); } static void __pci_pme_active(struct pci_dev *dev, bool enable) { u16 pmcsr; if (!dev->pme_support) return; pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); /* Clear PME_Status by writing 1 to it and enable PME# */ pmcsr |= PCI_PM_CTRL_PME_STATUS | PCI_PM_CTRL_PME_ENABLE; if (!enable) pmcsr &= ~PCI_PM_CTRL_PME_ENABLE; pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr); } /** * pci_pme_restore - Restore PME configuration after config space restore. * @dev: PCI device to update. */ void pci_pme_restore(struct pci_dev *dev) { u16 pmcsr; if (!dev->pme_support) return; pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); if (dev->wakeup_prepared) { pmcsr |= PCI_PM_CTRL_PME_ENABLE; pmcsr &= ~PCI_PM_CTRL_PME_STATUS; } else { pmcsr &= ~PCI_PM_CTRL_PME_ENABLE; pmcsr |= PCI_PM_CTRL_PME_STATUS; } pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr); } /** * pci_pme_active - enable or disable PCI device's PME# function * @dev: PCI device to handle. * @enable: 'true' to enable PME# generation; 'false' to disable it. * * The caller must verify that the device is capable of generating PME# before * calling this function with @enable equal to 'true'. */ void pci_pme_active(struct pci_dev *dev, bool enable) { __pci_pme_active(dev, enable); /* * PCI (as opposed to PCIe) PME requires that the device have * its PME# line hooked up correctly. Not all hardware vendors * do this, so the PME never gets delivered and the device * remains asleep. 
The easiest way around this is to * periodically walk the list of suspended devices and check * whether any have their PME flag set. The assumption is that * we'll wake up often enough anyway that this won't be a huge * hit, and the power savings from the devices will still be a * win. * * Although PCIe uses in-band PME message instead of PME# line * to report PME, PME does not work for some PCIe devices in * reality. For example, there are devices that set their PME * status bits, but don't really bother to send a PME message; * there are PCI Express Root Ports that don't bother to * trigger interrupts when they receive PME messages from the * devices below. So PME poll is used for PCIe devices too. */ if (dev->pme_poll) { struct pci_pme_device *pme_dev; if (enable) { pme_dev = kmalloc(sizeof(struct pci_pme_device), GFP_KERNEL); if (!pme_dev) { pci_warn(dev, "can't enable PME#\n"); return; } pme_dev->dev = dev; mutex_lock(&pci_pme_list_mutex); list_add(&pme_dev->list, &pci_pme_list); if (list_is_singular(&pci_pme_list)) queue_delayed_work(system_freezable_wq, &pci_pme_work, msecs_to_jiffies(PME_TIMEOUT)); mutex_unlock(&pci_pme_list_mutex); } else { mutex_lock(&pci_pme_list_mutex); list_for_each_entry(pme_dev, &pci_pme_list, list) { if (pme_dev->dev == dev) { list_del(&pme_dev->list); kfree(pme_dev); break; } } mutex_unlock(&pci_pme_list_mutex); } } pci_dbg(dev, "PME# %s\n", enable ? "enabled" : "disabled"); } EXPORT_SYMBOL(pci_pme_active); /** * __pci_enable_wake - enable PCI device as wakeup event source * @dev: PCI device affected * @state: PCI state from which device will issue wakeup events * @enable: True to enable event generation; false to disable * * This enables the device as a wakeup event source, or disables it. * When such events involves platform-specific hooks, those hooks are * called automatically by this routine. * * Devices with legacy power management (no standard PCI PM capabilities) * always require such platform hooks. * * RETURN VALUE: * 0 is returned on success * -EINVAL is returned if device is not supposed to wake up the system * Error code depending on the platform is returned if both the platform and * the native mechanism fail to enable the generation of wake-up events */ static int __pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable) { int ret = 0; /* * Bridges that are not power-manageable directly only signal * wakeup on behalf of subordinate devices which is set up * elsewhere, so skip them. However, bridges that are * power-manageable may signal wakeup for themselves (for example, * on a hotplug event) and they need to be covered here. */ if (!pci_power_manageable(dev)) return 0; /* Don't do the same thing twice in a row for one device. */ if (!!enable == !!dev->wakeup_prepared) return 0; /* * According to "PCI System Architecture" 4th ed. by Tom Shanley & Don * Anderson we should be doing PME# wake enable followed by ACPI wake * enable. To disable wake-up we call the platform first, for symmetry. */ if (enable) { int error; /* * Enable PME signaling if the device can signal PME from * D3cold regardless of whether or not it can signal PME from * the current target state, because that will allow it to * signal PME when the hierarchy above it goes into D3cold and * the device itself ends up in D3cold as a result of that. 
*/ if (pci_pme_capable(dev, state) || pci_pme_capable(dev, PCI_D3cold)) pci_pme_active(dev, true); else ret = 1; error = platform_pci_set_wakeup(dev, true); if (ret) ret = error; if (!ret) dev->wakeup_prepared = true; } else { platform_pci_set_wakeup(dev, false); pci_pme_active(dev, false); dev->wakeup_prepared = false; } return ret; } /** * pci_enable_wake - change wakeup settings for a PCI device * @pci_dev: Target device * @state: PCI state from which device will issue wakeup events * @enable: Whether or not to enable event generation * * If @enable is set, check device_may_wakeup() for the device before calling * __pci_enable_wake() for it. */ int pci_enable_wake(struct pci_dev *pci_dev, pci_power_t state, bool enable) { if (enable && !device_may_wakeup(&pci_dev->dev)) return -EINVAL; return __pci_enable_wake(pci_dev, state, enable); } EXPORT_SYMBOL(pci_enable_wake); /** * pci_wake_from_d3 - enable/disable device to wake up from D3_hot or D3_cold * @dev: PCI device to prepare * @enable: True to enable wake-up event generation; false to disable * * Many drivers want the device to wake up the system from D3_hot or D3_cold * and this function allows them to set that up cleanly - pci_enable_wake() * should not be called twice in a row to enable wake-up due to PCI PM vs ACPI * ordering constraints. * * This function only returns error code if the device is not allowed to wake * up the system from sleep or it is not capable of generating PME# from both * D3_hot and D3_cold and the platform is unable to enable wake-up power for it. */ int pci_wake_from_d3(struct pci_dev *dev, bool enable) { return pci_pme_capable(dev, PCI_D3cold) ? pci_enable_wake(dev, PCI_D3cold, enable) : pci_enable_wake(dev, PCI_D3hot, enable); } EXPORT_SYMBOL(pci_wake_from_d3); /** * pci_target_state - find an appropriate low power state for a given PCI dev * @dev: PCI device * @wakeup: Whether or not wakeup functionality will be enabled for the device. * * Use underlying platform code to find a supported low power state for @dev. * If the platform can't manage @dev, return the deepest state from which it * can generate wake events, based on any available PME info. */ static pci_power_t pci_target_state(struct pci_dev *dev, bool wakeup) { if (platform_pci_power_manageable(dev)) { /* * Call the platform to find the target state for the device. */ pci_power_t state = platform_pci_choose_state(dev); switch (state) { case PCI_POWER_ERROR: case PCI_UNKNOWN: return PCI_D3hot; case PCI_D1: case PCI_D2: if (pci_no_d1d2(dev)) return PCI_D3hot; } return state; } /* * If the device is in D3cold even though it's not power-manageable by * the platform, it may have been powered down by non-standard means. * Best to let it slumber. */ if (dev->current_state == PCI_D3cold) return PCI_D3cold; else if (!dev->pm_cap) return PCI_D0; if (wakeup && dev->pme_support) { pci_power_t state = PCI_D3hot; /* * Find the deepest state from which the device can generate * PME#. */ while (state && !(dev->pme_support & (1 << state))) state--; if (state) return state; else if (dev->pme_support & 1) return PCI_D0; } return PCI_D3hot; } /** * pci_prepare_to_sleep - prepare PCI device for system-wide transition * into a sleep state * @dev: Device to handle. * * Choose the power state appropriate for the device depending on whether * it can wake up the system and/or is power manageable by the platform * (PCI_D3hot is the default) and put the device into that state. 
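 *
 * For illustration, a suspend path built on this helper looks roughly like
 * the sketch below (hedged, not code from this file; the PCI core wraps
 * these calls in additional bookkeeping):
 *
 *	pci_save_state(pdev);
 *	if (pci_prepare_to_sleep(pdev))
 *		...leave the device in D0 and handle the error...
 *
 * The matching resume path is pci_back_from_sleep() (or an explicit
 * pci_set_power_state(pdev, PCI_D0)) followed by pci_restore_state().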
*/ int pci_prepare_to_sleep(struct pci_dev *dev) { bool wakeup = device_may_wakeup(&dev->dev); pci_power_t target_state = pci_target_state(dev, wakeup); int error; if (target_state == PCI_POWER_ERROR) return -EIO; pci_enable_wake(dev, target_state, wakeup); error = pci_set_power_state(dev, target_state); if (error) pci_enable_wake(dev, target_state, false); return error; } EXPORT_SYMBOL(pci_prepare_to_sleep); /** * pci_back_from_sleep - turn PCI device on during system-wide transition * into working state * @dev: Device to handle. * * Disable device's system wake-up capability and put it into D0. */ int pci_back_from_sleep(struct pci_dev *dev) { int ret = pci_set_power_state(dev, PCI_D0); if (ret) return ret; pci_enable_wake(dev, PCI_D0, false); return 0; } EXPORT_SYMBOL(pci_back_from_sleep); /** * pci_finish_runtime_suspend - Carry out PCI-specific part of runtime suspend. * @dev: PCI device being suspended. * * Prepare @dev to generate wake-up events at run time and put it into a low * power state. */ int pci_finish_runtime_suspend(struct pci_dev *dev) { pci_power_t target_state; int error; target_state = pci_target_state(dev, device_can_wakeup(&dev->dev)); if (target_state == PCI_POWER_ERROR) return -EIO; __pci_enable_wake(dev, target_state, pci_dev_run_wake(dev)); error = pci_set_power_state(dev, target_state); if (error) pci_enable_wake(dev, target_state, false); return error; } /** * pci_dev_run_wake - Check if device can generate run-time wake-up events. * @dev: Device to check. * * Return true if the device itself is capable of generating wake-up events * (through the platform or using the native PCIe PME) or if the device supports * PME and one of its upstream bridges can generate wake-up events. */ bool pci_dev_run_wake(struct pci_dev *dev) { struct pci_bus *bus = dev->bus; if (!dev->pme_support) return false; /* PME-capable in principle, but not from the target power state */ if (!pci_pme_capable(dev, pci_target_state(dev, true))) return false; if (device_can_wakeup(&dev->dev)) return true; while (bus->parent) { struct pci_dev *bridge = bus->self; if (device_can_wakeup(&bridge->dev)) return true; bus = bus->parent; } /* We have reached the root bus. */ if (bus->bridge) return device_can_wakeup(bus->bridge); return false; } EXPORT_SYMBOL_GPL(pci_dev_run_wake); /** * pci_dev_need_resume - Check if it is necessary to resume the device. * @pci_dev: Device to check. * * Return 'true' if the device is not runtime-suspended or it has to be * reconfigured due to wakeup settings difference between system and runtime * suspend, or the current power state of it is not suitable for the upcoming * (system-wide) transition. */ bool pci_dev_need_resume(struct pci_dev *pci_dev) { struct device *dev = &pci_dev->dev; pci_power_t target_state; if (!pm_runtime_suspended(dev) || platform_pci_need_resume(pci_dev)) return true; target_state = pci_target_state(pci_dev, device_may_wakeup(dev)); /* * If the earlier platform check has not triggered, D3cold is just power * removal on top of D3hot, so no need to resume the device in that * case. */ return target_state != pci_dev->current_state && target_state != PCI_D3cold && pci_dev->current_state != PCI_D3hot; } /** * pci_dev_adjust_pme - Adjust PME setting for a suspended device. * @pci_dev: Device to check. * * If the device is suspended and it is not configured for system wakeup, * disable PME for it to prevent it from waking up the system unnecessarily. 
* * Note that if the device's power state is D3cold and the platform check in * pci_dev_need_resume() has not triggered, the device's configuration need not * be changed. */ void pci_dev_adjust_pme(struct pci_dev *pci_dev) { struct device *dev = &pci_dev->dev; spin_lock_irq(&dev->power.lock); if (pm_runtime_suspended(dev) && !device_may_wakeup(dev) && pci_dev->current_state < PCI_D3cold) __pci_pme_active(pci_dev, false); spin_unlock_irq(&dev->power.lock); } /** * pci_dev_complete_resume - Finalize resume from system sleep for a device. * @pci_dev: Device to handle. * * If the device is runtime suspended and wakeup-capable, enable PME for it as * it might have been disabled during the prepare phase of system suspend if * the device was not configured for system wakeup. */ void pci_dev_complete_resume(struct pci_dev *pci_dev) { struct device *dev = &pci_dev->dev; if (!pci_dev_run_wake(pci_dev)) return; spin_lock_irq(&dev->power.lock); if (pm_runtime_suspended(dev) && pci_dev->current_state < PCI_D3cold) __pci_pme_active(pci_dev, true); spin_unlock_irq(&dev->power.lock); } /** * pci_choose_state - Choose the power state of a PCI device. * @dev: Target PCI device. * @state: Target state for the whole system. * * Returns PCI power state suitable for @dev and @state. */ pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state) { if (state.event == PM_EVENT_ON) return PCI_D0; return pci_target_state(dev, false); } EXPORT_SYMBOL(pci_choose_state); void pci_config_pm_runtime_get(struct pci_dev *pdev) { struct device *dev = &pdev->dev; struct device *parent = dev->parent; if (parent) pm_runtime_get_sync(parent); pm_runtime_get_noresume(dev); /* * pdev->current_state is set to PCI_D3cold during suspending, * so wait until suspending completes */ pm_runtime_barrier(dev); /* * Only need to resume devices in D3cold, because config * registers are still accessible for devices suspended but * not in D3cold. */ if (pdev->current_state == PCI_D3cold) pm_runtime_resume(dev); } void pci_config_pm_runtime_put(struct pci_dev *pdev) { struct device *dev = &pdev->dev; struct device *parent = dev->parent; pm_runtime_put(dev); if (parent) pm_runtime_put_sync(parent); } static const struct dmi_system_id bridge_d3_blacklist[] = { #ifdef CONFIG_X86 { /* * Gigabyte X299 root port is not marked as hotplug capable * which allows Linux to power manage it. However, this * confuses the BIOS SMI handler so don't power manage root * ports on that system. */ .ident = "X299 DESIGNARE EX-CF", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Gigabyte Technology Co., Ltd."), DMI_MATCH(DMI_BOARD_NAME, "X299 DESIGNARE EX-CF"), }, }, { /* * Downstream device is not accessible after putting a root port * into D3cold and back into D0 on Elo Continental Z2 board */ .ident = "Elo Continental Z2", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Elo Touch Solutions"), DMI_MATCH(DMI_BOARD_NAME, "Geminilake"), DMI_MATCH(DMI_BOARD_VERSION, "Continental Z2"), }, }, { /* * Changing the power state of the root port that the dGPU is * connected to fails * https://gitlab.freedesktop.org/drm/amd/-/issues/3229 */ .ident = "Hewlett-Packard HP Pavilion 17 Notebook PC/1972", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), DMI_MATCH(DMI_BOARD_NAME, "1972"), DMI_MATCH(DMI_BOARD_VERSION, "95.33"), }, }, #endif { } }; /** * pci_bridge_d3_possible - Is it possible to put the bridge into D3 * @bridge: Bridge to check * * Currently we only allow D3 for some PCIe ports and for Thunderbolt. * * Return: Whether it is possible to move the bridge to D3.
* * The return value is guaranteed to be constant across the entire lifetime * of the bridge, including its hot-removal. */ bool pci_bridge_d3_possible(struct pci_dev *bridge) { if (!pci_is_pcie(bridge)) return false; switch (pci_pcie_type(bridge)) { case PCI_EXP_TYPE_ROOT_PORT: case PCI_EXP_TYPE_UPSTREAM: case PCI_EXP_TYPE_DOWNSTREAM: if (pci_bridge_d3_disable) return false; /* * Hotplug ports handled by platform firmware may not be put * into D3 by the OS, e.g. ACPI slots ... */ if (bridge->is_hotplug_bridge && !bridge->is_pciehp) return false; /* ... or PCIe hotplug ports not handled natively by the OS. */ if (bridge->is_pciehp && !pciehp_is_native(bridge)) return false; if (pci_bridge_d3_force) return true; /* Even the oldest 2010 Thunderbolt controller supports D3. */ if (bridge->is_thunderbolt) return true; /* Platform might know better if the bridge supports D3 */ if (platform_pci_bridge_d3(bridge)) return true; /* * Hotplug ports handled natively by the OS were not validated * by vendors for runtime D3 at least until 2018 because there * was no OS support. */ if (bridge->is_pciehp) return false; if (dmi_check_system(bridge_d3_blacklist)) return false; /* * Out of caution, we only allow PCIe ports from 2015 or newer * into D3 on x86. */ if (!IS_ENABLED(CONFIG_X86) || dmi_get_bios_year() >= 2015) return true; break; } return false; } static int pci_dev_check_d3cold(struct pci_dev *dev, void *data) { bool *d3cold_ok = data; if (/* The device needs to be allowed to go D3cold ... */ dev->no_d3cold || !dev->d3cold_allowed || /* ... and if it is wakeup capable to do so from D3cold. */ (device_may_wakeup(&dev->dev) && !pci_pme_capable(dev, PCI_D3cold)) || /* If it is a bridge it must be allowed to go to D3. */ !pci_power_manageable(dev)) *d3cold_ok = false; return !*d3cold_ok; } /* * pci_bridge_d3_update - Update bridge D3 capabilities * @dev: PCI device which is changed * * Update upstream bridge PM capabilities accordingly depending on if the * device PM configuration was changed or the device is being removed. The * change is also propagated upstream. */ void pci_bridge_d3_update(struct pci_dev *dev) { bool remove = !device_is_registered(&dev->dev); struct pci_dev *bridge; bool d3cold_ok = true; bridge = pci_upstream_bridge(dev); if (!bridge || !pci_bridge_d3_possible(bridge)) return; /* * If D3 is currently allowed for the bridge, removing one of its * children won't change that. */ if (remove && bridge->bridge_d3) return; /* * If D3 is currently allowed for the bridge and a child is added or * changed, disallowance of D3 can only be caused by that child, so * we only need to check that single device, not any of its siblings. * * If D3 is currently not allowed for the bridge, checking the device * first may allow us to skip checking its siblings. */ if (!remove) pci_dev_check_d3cold(dev, &d3cold_ok); /* * If D3 is currently not allowed for the bridge, this may be caused * either by the device being changed/removed or any of its siblings, * so we need to go through all children to find out if one of them * continues to block D3. */ if (d3cold_ok && !bridge->bridge_d3) pci_walk_bus(bridge->subordinate, pci_dev_check_d3cold, &d3cold_ok); if (bridge->bridge_d3 != d3cold_ok) { bridge->bridge_d3 = d3cold_ok; /* Propagate change to upstream bridges */ pci_bridge_d3_update(bridge); } } /** * pci_d3cold_enable - Enable D3cold for device * @dev: PCI device to handle * * This function can be used in drivers to enable D3cold from the device * they handle. 
It also updates upstream PCI bridge PM capabilities * accordingly. */ void pci_d3cold_enable(struct pci_dev *dev) { if (dev->no_d3cold) { dev->no_d3cold = false; pci_bridge_d3_update(dev); } } EXPORT_SYMBOL_GPL(pci_d3cold_enable); /** * pci_d3cold_disable - Disable D3cold for device * @dev: PCI device to handle * * This function can be used in drivers to disable D3cold from the device * they handle. It also updates upstream PCI bridge PM capabilities * accordingly. */ void pci_d3cold_disable(struct pci_dev *dev) { if (!dev->no_d3cold) { dev->no_d3cold = true; pci_bridge_d3_update(dev); } } EXPORT_SYMBOL_GPL(pci_d3cold_disable); void pci_pm_power_up_and_verify_state(struct pci_dev *pci_dev) { pci_power_up(pci_dev); pci_update_current_state(pci_dev, PCI_D0); } /** * pci_pm_init - Initialize PM functions of given PCI device * @dev: PCI device to handle. */ void pci_pm_init(struct pci_dev *dev) { int pm; u16 pmc; device_enable_async_suspend(&dev->dev); dev->wakeup_prepared = false; dev->pm_cap = 0; dev->pme_support = 0; /* find PCI PM capability in list */ pm = pci_find_capability(dev, PCI_CAP_ID_PM); if (!pm) goto poweron; /* Check device's ability to generate PME# */ pci_read_config_word(dev, pm + PCI_PM_PMC, &pmc); if ((pmc & PCI_PM_CAP_VER_MASK) > 3) { pci_err(dev, "unsupported PM cap regs version (%u)\n", pmc & PCI_PM_CAP_VER_MASK); goto poweron; } dev->pm_cap = pm; dev->d3hot_delay = PCI_PM_D3HOT_WAIT; dev->d3cold_delay = PCI_PM_D3COLD_WAIT; dev->bridge_d3 = pci_bridge_d3_possible(dev); dev->d3cold_allowed = true; dev->d1_support = false; dev->d2_support = false; if (!pci_no_d1d2(dev)) { if (pmc & PCI_PM_CAP_D1) dev->d1_support = true; if (pmc & PCI_PM_CAP_D2) dev->d2_support = true; if (dev->d1_support || dev->d2_support) pci_info(dev, "supports%s%s\n", dev->d1_support ? " D1" : "", dev->d2_support ? " D2" : ""); } pmc &= PCI_PM_CAP_PME_MASK; if (pmc) { pci_info(dev, "PME# supported from%s%s%s%s%s\n", (pmc & PCI_PM_CAP_PME_D0) ? " D0" : "", (pmc & PCI_PM_CAP_PME_D1) ? " D1" : "", (pmc & PCI_PM_CAP_PME_D2) ? " D2" : "", (pmc & PCI_PM_CAP_PME_D3hot) ? " D3hot" : "", (pmc & PCI_PM_CAP_PME_D3cold) ? " D3cold" : ""); dev->pme_support = FIELD_GET(PCI_PM_CAP_PME_MASK, pmc); dev->pme_poll = true; /* * Make device's PM flags reflect the wake-up capability, but * let the user space enable it to wake up the system as needed. 
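 *
 * In practice that means the capability is advertised here but left
 * disabled; it can be turned on later either from user space through the
 * device's power/wakeup sysfs attribute or by a driver opting in
 * explicitly, e.g. (illustrative only, as a driver would call it):
 *
 *	device_wakeup_enable(&pdev->dev);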
*/ device_set_wakeup_capable(&dev->dev, true); /* Disable the PME# generation functionality */ pci_pme_active(dev, false); } poweron: pci_pm_power_up_and_verify_state(dev); pm_runtime_forbid(&dev->dev); pm_runtime_set_active(&dev->dev); pm_runtime_enable(&dev->dev); } static unsigned long pci_ea_flags(struct pci_dev *dev, u8 prop) { unsigned long flags = IORESOURCE_PCI_FIXED | IORESOURCE_PCI_EA_BEI; switch (prop) { case PCI_EA_P_MEM: case PCI_EA_P_VF_MEM: flags |= IORESOURCE_MEM; break; case PCI_EA_P_MEM_PREFETCH: case PCI_EA_P_VF_MEM_PREFETCH: flags |= IORESOURCE_MEM | IORESOURCE_PREFETCH; break; case PCI_EA_P_IO: flags |= IORESOURCE_IO; break; default: return 0; } return flags; } static struct resource *pci_ea_get_resource(struct pci_dev *dev, u8 bei, u8 prop) { if (bei <= PCI_EA_BEI_BAR5 && prop <= PCI_EA_P_IO) return &dev->resource[bei]; #ifdef CONFIG_PCI_IOV else if (bei >= PCI_EA_BEI_VF_BAR0 && bei <= PCI_EA_BEI_VF_BAR5 && (prop == PCI_EA_P_VF_MEM || prop == PCI_EA_P_VF_MEM_PREFETCH)) return &dev->resource[PCI_IOV_RESOURCES + bei - PCI_EA_BEI_VF_BAR0]; #endif else if (bei == PCI_EA_BEI_ROM) return &dev->resource[PCI_ROM_RESOURCE]; else return NULL; } /* Read an Enhanced Allocation (EA) entry */ static int pci_ea_read(struct pci_dev *dev, int offset) { struct resource *res; const char *res_name; int ent_size, ent_offset = offset; resource_size_t start, end; unsigned long flags; u32 dw0, bei, base, max_offset; u8 prop; bool support_64 = (sizeof(resource_size_t) >= 8); pci_read_config_dword(dev, ent_offset, &dw0); ent_offset += 4; /* Entry size field indicates DWORDs after 1st */ ent_size = (FIELD_GET(PCI_EA_ES, dw0) + 1) << 2; if (!(dw0 & PCI_EA_ENABLE)) /* Entry not enabled */ goto out; bei = FIELD_GET(PCI_EA_BEI, dw0); prop = FIELD_GET(PCI_EA_PP, dw0); /* * If the Property is in the reserved range, try the Secondary * Property instead. 
*/ if (prop > PCI_EA_P_BRIDGE_IO && prop < PCI_EA_P_MEM_RESERVED) prop = FIELD_GET(PCI_EA_SP, dw0); if (prop > PCI_EA_P_BRIDGE_IO) goto out; res = pci_ea_get_resource(dev, bei, prop); res_name = pci_resource_name(dev, bei); if (!res) { pci_err(dev, "Unsupported EA entry BEI: %u\n", bei); goto out; } flags = pci_ea_flags(dev, prop); if (!flags) { pci_err(dev, "Unsupported EA properties: %#x\n", prop); goto out; } /* Read Base */ pci_read_config_dword(dev, ent_offset, &base); start = (base & PCI_EA_FIELD_MASK); ent_offset += 4; /* Read MaxOffset */ pci_read_config_dword(dev, ent_offset, &max_offset); ent_offset += 4; /* Read Base MSBs (if 64-bit entry) */ if (base & PCI_EA_IS_64) { u32 base_upper; pci_read_config_dword(dev, ent_offset, &base_upper); ent_offset += 4; flags |= IORESOURCE_MEM_64; /* entry starts above 32-bit boundary, can't use */ if (!support_64 && base_upper) goto out; if (support_64) start |= ((u64)base_upper << 32); } end = start + (max_offset | 0x03); /* Read MaxOffset MSBs (if 64-bit entry) */ if (max_offset & PCI_EA_IS_64) { u32 max_offset_upper; pci_read_config_dword(dev, ent_offset, &max_offset_upper); ent_offset += 4; flags |= IORESOURCE_MEM_64; /* entry too big, can't use */ if (!support_64 && max_offset_upper) goto out; if (support_64) end += ((u64)max_offset_upper << 32); } if (end < start) { pci_err(dev, "EA Entry crosses address boundary\n"); goto out; } if (ent_size != ent_offset - offset) { pci_err(dev, "EA Entry Size (%d) does not match length read (%d)\n", ent_size, ent_offset - offset); goto out; } res->name = pci_name(dev); res->start = start; res->end = end; res->flags = flags; if (bei <= PCI_EA_BEI_BAR5) pci_info(dev, "%s %pR: from Enhanced Allocation, properties %#02x\n", res_name, res, prop); else if (bei == PCI_EA_BEI_ROM) pci_info(dev, "%s %pR: from Enhanced Allocation, properties %#02x\n", res_name, res, prop); else if (bei >= PCI_EA_BEI_VF_BAR0 && bei <= PCI_EA_BEI_VF_BAR5) pci_info(dev, "%s %pR: from Enhanced Allocation, properties %#02x\n", res_name, res, prop); else pci_info(dev, "BEI %d %pR: from Enhanced Allocation, properties %#02x\n", bei, res, prop); out: return offset + ent_size; } /* Enhanced Allocation Initialization */ void pci_ea_init(struct pci_dev *dev) { int ea; u8 num_ent; int offset; int i; /* find PCI EA capability in list */ ea = pci_find_capability(dev, PCI_CAP_ID_EA); if (!ea) return; /* determine the number of entries */ pci_bus_read_config_byte(dev->bus, dev->devfn, ea + PCI_EA_NUM_ENT, &num_ent); num_ent &= PCI_EA_NUM_ENT_MASK; offset = ea + PCI_EA_FIRST_ENT; /* Skip DWORD 2 for type 1 functions */ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) offset += 4; /* parse each EA entry */ for (i = 0; i < num_ent; ++i) offset = pci_ea_read(dev, offset); } static void pci_add_saved_cap(struct pci_dev *pci_dev, struct pci_cap_saved_state *new_cap) { hlist_add_head(&new_cap->next, &pci_dev->saved_cap_space); } /** * _pci_add_cap_save_buffer - allocate buffer for saving given * capability registers * @dev: the PCI device * @cap: the capability to allocate the buffer for * @extended: Standard or Extended capability ID * @size: requested size of the buffer */ static int _pci_add_cap_save_buffer(struct pci_dev *dev, u16 cap, bool extended, unsigned int size) { int pos; struct pci_cap_saved_state *save_state; if (extended) pos = pci_find_ext_capability(dev, cap); else pos = pci_find_capability(dev, cap); if (!pos) return 0; save_state = kzalloc(sizeof(*save_state) + size, GFP_KERNEL); if (!save_state) return -ENOMEM; save_state->cap.cap_nr = 
cap; save_state->cap.cap_extended = extended; save_state->cap.size = size; pci_add_saved_cap(dev, save_state); return 0; } int pci_add_cap_save_buffer(struct pci_dev *dev, char cap, unsigned int size) { return _pci_add_cap_save_buffer(dev, cap, false, size); } int pci_add_ext_cap_save_buffer(struct pci_dev *dev, u16 cap, unsigned int size) { return _pci_add_cap_save_buffer(dev, cap, true, size); } /** * pci_allocate_cap_save_buffers - allocate buffers for saving capabilities * @dev: the PCI device */ void pci_allocate_cap_save_buffers(struct pci_dev *dev) { int error; error = pci_add_cap_save_buffer(dev, PCI_CAP_ID_EXP, PCI_EXP_SAVE_REGS * sizeof(u16)); if (error) pci_err(dev, "unable to preallocate PCI Express save buffer\n"); error = pci_add_cap_save_buffer(dev, PCI_CAP_ID_PCIX, sizeof(u16)); if (error) pci_err(dev, "unable to preallocate PCI-X save buffer\n"); error = pci_add_ext_cap_save_buffer(dev, PCI_EXT_CAP_ID_LTR, 2 * sizeof(u16)); if (error) pci_err(dev, "unable to allocate suspend buffer for LTR\n"); pci_allocate_vc_save_buffers(dev); } void pci_free_cap_save_buffers(struct pci_dev *dev) { struct pci_cap_saved_state *tmp; struct hlist_node *n; hlist_for_each_entry_safe(tmp, n, &dev->saved_cap_space, next) kfree(tmp); } /** * pci_configure_ari - enable or disable ARI forwarding * @dev: the PCI device * * If @dev and its upstream bridge both support ARI, enable ARI in the * bridge. Otherwise, disable ARI in the bridge. */ void pci_configure_ari(struct pci_dev *dev) { u32 cap; struct pci_dev *bridge; if (pcie_ari_disabled || !pci_is_pcie(dev) || dev->devfn) return; bridge = dev->bus->self; if (!bridge) return; pcie_capability_read_dword(bridge, PCI_EXP_DEVCAP2, &cap); if (!(cap & PCI_EXP_DEVCAP2_ARI)) return; if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ARI)) { pcie_capability_set_word(bridge, PCI_EXP_DEVCTL2, PCI_EXP_DEVCTL2_ARI); bridge->ari_enabled = 1; } else { pcie_capability_clear_word(bridge, PCI_EXP_DEVCTL2, PCI_EXP_DEVCTL2_ARI); bridge->ari_enabled = 0; } } static bool pci_acs_flags_enabled(struct pci_dev *pdev, u16 acs_flags) { int pos; u16 cap, ctrl; pos = pdev->acs_cap; if (!pos) return false; /* * Except for egress control, capabilities are either required * or only required if controllable. Features missing from the * capability field can therefore be assumed as hard-wired enabled. */ pci_read_config_word(pdev, pos + PCI_ACS_CAP, &cap); acs_flags &= (cap | PCI_ACS_EC); pci_read_config_word(pdev, pos + PCI_ACS_CTRL, &ctrl); return (ctrl & acs_flags) == acs_flags; } /** * pci_acs_enabled - test ACS against required flags for a given device * @pdev: device to test * @acs_flags: required PCI ACS flags * * Return true if the device supports the provided flags. Automatically * filters out flags that are not implemented on multifunction devices. * * Note that this interface checks the effective ACS capabilities of the * device rather than the actual capabilities. For instance, most single * function endpoints are not required to support ACS because they have no * opportunity for peer-to-peer access. We therefore return 'true' * regardless of whether the device exposes an ACS capability. This makes * it much easier for callers of this function to ignore the actual type * or topology of the device when testing ACS support. */ bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags) { int ret; ret = pci_dev_specific_acs_enabled(pdev, acs_flags); if (ret >= 0) return ret > 0; /* * Conventional PCI and PCI-X devices never support ACS, either * effectively or actually. 
The shared bus topology implies that * any device on the bus can receive or snoop DMA. */ if (!pci_is_pcie(pdev)) return false; switch (pci_pcie_type(pdev)) { /* * PCI/X-to-PCIe bridges are not specifically mentioned by the spec, * but since their primary interface is PCI/X, we conservatively * handle them as we would a non-PCIe device. */ case PCI_EXP_TYPE_PCIE_BRIDGE: /* * PCIe 3.0, 6.12.1 excludes ACS on these devices. "ACS is never * applicable... must never implement an ACS Extended Capability...". * This seems arbitrary, but we take a conservative interpretation * of this statement. */ case PCI_EXP_TYPE_PCI_BRIDGE: case PCI_EXP_TYPE_RC_EC: return false; /* * PCIe 3.0, 6.12.1.1 specifies that downstream and root ports should * implement ACS in order to indicate their peer-to-peer capabilities, * regardless of whether they are single- or multi-function devices. */ case PCI_EXP_TYPE_DOWNSTREAM: case PCI_EXP_TYPE_ROOT_PORT: return pci_acs_flags_enabled(pdev, acs_flags); /* * PCIe 3.0, 6.12.1.2 specifies ACS capabilities that should be * implemented by the remaining PCIe types to indicate peer-to-peer * capabilities, but only when they are part of a multifunction * device. The footnote for section 6.12 indicates the specific * PCIe types included here. */ case PCI_EXP_TYPE_ENDPOINT: case PCI_EXP_TYPE_UPSTREAM: case PCI_EXP_TYPE_LEG_END: case PCI_EXP_TYPE_RC_END: if (!pdev->multifunction) break; return pci_acs_flags_enabled(pdev, acs_flags); } /* * PCIe 3.0, 6.12.1.3 specifies no ACS capabilities are applicable * to single function devices with the exception of downstream ports. */ return true; } /** * pci_acs_path_enabled - test ACS flags from start to end in a hierarchy * @start: starting downstream device * @end: ending upstream device or NULL to search to the root bus * @acs_flags: required flags * * Walk up a device tree from start to end testing PCI ACS support. If * any step along the way does not support the required flags, return false. */ bool pci_acs_path_enabled(struct pci_dev *start, struct pci_dev *end, u16 acs_flags) { struct pci_dev *pdev, *parent = start; do { pdev = parent; if (!pci_acs_enabled(pdev, acs_flags)) return false; if (pci_is_root_bus(pdev->bus)) return (end == NULL); parent = pdev->bus->self; } while (pdev != end); return true; } /** * pci_acs_init - Initialize ACS if hardware supports it * @dev: the PCI device */ void pci_acs_init(struct pci_dev *dev) { dev->acs_cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS); /* * Attempt to enable ACS regardless of capability because some Root * Ports (e.g. those quirked with *_intel_pch_acs_*) do not have * the standard ACS capability but still support ACS via those * quirks. */ pci_enable_acs(dev); } void pci_rebar_init(struct pci_dev *pdev) { pdev->rebar_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_REBAR); } /** * pci_rebar_find_pos - find position of resize ctrl reg for BAR * @pdev: PCI device * @bar: BAR to find * * Helper to find the position of the ctrl register for a BAR. * Returns -ENOTSUPP if resizable BARs are not supported at all. * Returns -ENOENT if no ctrl register for the BAR could be found. 
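 *
 * For reference, the "size" values used by these resizable BAR helpers are
 * the capability's log2 encoding: value n selects a BAR of 2^(n + 20) bytes,
 * so 0 = 1 MB, 8 = 256 MB and 13 = 8 GB. pci_rebar_bytes_to_size() performs
 * the bytes-to-encoding conversion; an illustrative (not from this file)
 * request for a 256 MB BAR 0 would be:
 *
 *	pci_rebar_set_size(pdev, 0, 8);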
*/ static int pci_rebar_find_pos(struct pci_dev *pdev, int bar) { unsigned int pos, nbars, i; u32 ctrl; if (pci_resource_is_iov(bar)) { pos = pci_iov_vf_rebar_cap(pdev); bar = pci_resource_num_to_vf_bar(bar); } else { pos = pdev->rebar_cap; } if (!pos) return -ENOTSUPP; pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); nbars = FIELD_GET(PCI_REBAR_CTRL_NBAR_MASK, ctrl); for (i = 0; i < nbars; i++, pos += 8) { int bar_idx; pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); bar_idx = FIELD_GET(PCI_REBAR_CTRL_BAR_IDX, ctrl); if (bar_idx == bar) return pos; } return -ENOENT; } /** * pci_rebar_get_possible_sizes - get possible sizes for BAR * @pdev: PCI device * @bar: BAR to query * * Get the possible sizes of a resizable BAR as bitmask defined in the spec * (bit 0=1MB, bit 31=128TB). Returns 0 if BAR isn't resizable. */ u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar) { int pos; u32 cap; pos = pci_rebar_find_pos(pdev, bar); if (pos < 0) return 0; pci_read_config_dword(pdev, pos + PCI_REBAR_CAP, &cap); cap = FIELD_GET(PCI_REBAR_CAP_SIZES, cap); /* Sapphire RX 5600 XT Pulse has an invalid cap dword for BAR 0 */ if (pdev->vendor == PCI_VENDOR_ID_ATI && pdev->device == 0x731f && bar == 0 && cap == 0x700) return 0x3f00; return cap; } EXPORT_SYMBOL(pci_rebar_get_possible_sizes); /** * pci_rebar_get_current_size - get the current size of a BAR * @pdev: PCI device * @bar: BAR to set size to * * Read the size of a BAR from the resizable BAR config. * Returns size if found or negative error code. */ int pci_rebar_get_current_size(struct pci_dev *pdev, int bar) { int pos; u32 ctrl; pos = pci_rebar_find_pos(pdev, bar); if (pos < 0) return pos; pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); return FIELD_GET(PCI_REBAR_CTRL_BAR_SIZE, ctrl); } /** * pci_rebar_set_size - set a new size for a BAR * @pdev: PCI device * @bar: BAR to set size to * @size: new size as defined in the spec (0=1MB, 31=128TB) * * Set the new size of a BAR as defined in the spec. * Returns zero if resizing was successful, error code otherwise. */ int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size) { int pos; u32 ctrl; pos = pci_rebar_find_pos(pdev, bar); if (pos < 0) return pos; pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE; ctrl |= FIELD_PREP(PCI_REBAR_CTRL_BAR_SIZE, size); pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl); return 0; } /** * pci_enable_atomic_ops_to_root - enable AtomicOp requests to root port * @dev: the PCI device * @cap_mask: mask of desired AtomicOp sizes, including one or more of: * PCI_EXP_DEVCAP2_ATOMIC_COMP32 * PCI_EXP_DEVCAP2_ATOMIC_COMP64 * PCI_EXP_DEVCAP2_ATOMIC_COMP128 * * Return 0 if all upstream bridges support AtomicOp routing, egress * blocking is disabled on all upstream ports, and the root port supports * the requested completion capabilities (32-bit, 64-bit and/or 128-bit * AtomicOp completion), or negative otherwise. */ int pci_enable_atomic_ops_to_root(struct pci_dev *dev, u32 cap_mask) { struct pci_bus *bus = dev->bus; struct pci_dev *bridge; u32 cap, ctl2; /* * Per PCIe r5.0, sec 9.3.5.10, the AtomicOp Requester Enable bit * in Device Control 2 is reserved in VFs and the PF value applies * to all associated VFs. */ if (dev->is_virtfn) return -EINVAL; if (!pci_is_pcie(dev)) return -EINVAL; /* * Per PCIe r4.0, sec 6.15, endpoints and root ports may be * AtomicOp requesters. For now, we only support endpoints as * requesters and root ports as completers. 
No endpoints as * completers, and no peer-to-peer. */ switch (pci_pcie_type(dev)) { case PCI_EXP_TYPE_ENDPOINT: case PCI_EXP_TYPE_LEG_END: case PCI_EXP_TYPE_RC_END: break; default: return -EINVAL; } while (bus->parent) { bridge = bus->self; pcie_capability_read_dword(bridge, PCI_EXP_DEVCAP2, &cap); switch (pci_pcie_type(bridge)) { /* Ensure switch ports support AtomicOp routing */ case PCI_EXP_TYPE_UPSTREAM: case PCI_EXP_TYPE_DOWNSTREAM: if (!(cap & PCI_EXP_DEVCAP2_ATOMIC_ROUTE)) return -EINVAL; break; /* Ensure root port supports all the sizes we care about */ case PCI_EXP_TYPE_ROOT_PORT: if ((cap & cap_mask) != cap_mask) return -EINVAL; break; } /* Ensure upstream ports don't block AtomicOps on egress */ if (pci_pcie_type(bridge) == PCI_EXP_TYPE_UPSTREAM) { pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2, &ctl2); if (ctl2 & PCI_EXP_DEVCTL2_ATOMIC_EGRESS_BLOCK) return -EINVAL; } bus = bus->parent; } pcie_capability_set_word(dev, PCI_EXP_DEVCTL2, PCI_EXP_DEVCTL2_ATOMIC_REQ); return 0; } EXPORT_SYMBOL(pci_enable_atomic_ops_to_root); /** * pci_release_region - Release a PCI bar * @pdev: PCI device whose resources were previously reserved by * pci_request_region() * @bar: BAR to release * * Releases the PCI I/O and memory resources previously reserved by a * successful call to pci_request_region(). Call this function only * after all use of the PCI regions has ceased. */ void pci_release_region(struct pci_dev *pdev, int bar) { if (!pci_bar_index_is_valid(bar)) return; if (pci_resource_len(pdev, bar) == 0) return; if (pci_resource_flags(pdev, bar) & IORESOURCE_IO) release_region(pci_resource_start(pdev, bar), pci_resource_len(pdev, bar)); else if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM) release_mem_region(pci_resource_start(pdev, bar), pci_resource_len(pdev, bar)); } EXPORT_SYMBOL(pci_release_region); /** * __pci_request_region - Reserved PCI I/O and memory resource * @pdev: PCI device whose resources are to be reserved * @bar: BAR to be reserved * @name: name of the driver requesting the resource * @exclusive: whether the region access is exclusive or not * * Returns: 0 on success, negative error code on failure. * * Mark the PCI region associated with PCI device @pdev BAR @bar as being * reserved by owner @name. Do not access any address inside the PCI regions * unless this call returns successfully. * * If @exclusive is set, then the region is marked so that userspace * is explicitly not allowed to map the resource via /dev/mem or * sysfs MMIO access. * * Returns 0 on success, or %EBUSY on error. A warning * message is also printed on failure. */ static int __pci_request_region(struct pci_dev *pdev, int bar, const char *name, int exclusive) { if (!pci_bar_index_is_valid(bar)) return -EINVAL; if (pci_resource_len(pdev, bar) == 0) return 0; if (pci_resource_flags(pdev, bar) & IORESOURCE_IO) { if (!request_region(pci_resource_start(pdev, bar), pci_resource_len(pdev, bar), name)) goto err_out; } else if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM) { if (!__request_mem_region(pci_resource_start(pdev, bar), pci_resource_len(pdev, bar), name, exclusive)) goto err_out; } return 0; err_out: pci_warn(pdev, "BAR %d: can't reserve %pR\n", bar, &pdev->resource[bar]); return -EBUSY; } /** * pci_request_region - Reserve PCI I/O and memory resource * @pdev: PCI device whose resources are to be reserved * @bar: BAR to be reserved * @name: name of the driver requesting the resource * * Returns: 0 on success, negative error code on failure. 
* * Mark the PCI region associated with PCI device @pdev BAR @bar as being * reserved by owner @name. Do not access any address inside the PCI regions * unless this call returns successfully. * * Returns 0 on success, or %EBUSY on error. A warning * message is also printed on failure. */ int pci_request_region(struct pci_dev *pdev, int bar, const char *name) { return __pci_request_region(pdev, bar, name, 0); } EXPORT_SYMBOL(pci_request_region); /** * pci_release_selected_regions - Release selected PCI I/O and memory resources * @pdev: PCI device whose resources were previously reserved * @bars: Bitmask of BARs to be released * * Release selected PCI I/O and memory resources previously reserved. * Call this function only after all use of the PCI regions has ceased. */ void pci_release_selected_regions(struct pci_dev *pdev, int bars) { int i; for (i = 0; i < PCI_STD_NUM_BARS; i++) if (bars & (1 << i)) pci_release_region(pdev, i); } EXPORT_SYMBOL(pci_release_selected_regions); static int __pci_request_selected_regions(struct pci_dev *pdev, int bars, const char *name, int excl) { int i; for (i = 0; i < PCI_STD_NUM_BARS; i++) if (bars & (1 << i)) if (__pci_request_region(pdev, i, name, excl)) goto err_out; return 0; err_out: while (--i >= 0) if (bars & (1 << i)) pci_release_region(pdev, i); return -EBUSY; } /** * pci_request_selected_regions - Reserve selected PCI I/O and memory resources * @pdev: PCI device whose resources are to be reserved * @bars: Bitmask of BARs to be requested * @name: Name of the driver requesting the resources * * Returns: 0 on success, negative error code on failure. */ int pci_request_selected_regions(struct pci_dev *pdev, int bars, const char *name) { return __pci_request_selected_regions(pdev, bars, name, 0); } EXPORT_SYMBOL(pci_request_selected_regions); /** * pci_request_selected_regions_exclusive - Request regions exclusively * @pdev: PCI device to request regions from * @bars: bit mask of BARs to request * @name: name of the driver requesting the resources * * Returns: 0 on success, negative error code on failure. */ int pci_request_selected_regions_exclusive(struct pci_dev *pdev, int bars, const char *name) { return __pci_request_selected_regions(pdev, bars, name, IORESOURCE_EXCLUSIVE); } EXPORT_SYMBOL(pci_request_selected_regions_exclusive); /** * pci_release_regions - Release reserved PCI I/O and memory resources * @pdev: PCI device whose resources were previously reserved by * pci_request_regions() * * Releases all PCI I/O and memory resources previously reserved by a * successful call to pci_request_regions(). Call this function only * after all use of the PCI regions has ceased. */ void pci_release_regions(struct pci_dev *pdev) { pci_release_selected_regions(pdev, (1 << PCI_STD_NUM_BARS) - 1); } EXPORT_SYMBOL(pci_release_regions); /** * pci_request_regions - Reserve PCI I/O and memory resources * @pdev: PCI device whose resources are to be reserved * @name: name of the driver requesting the resources * * Mark all PCI regions associated with PCI device @pdev as being reserved by * owner @name. Do not access any address inside the PCI regions unless this * call returns successfully. * * Returns 0 on success, or %EBUSY on error. A warning * message is also printed on failure. 
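 *
 * A hedged usage sketch (the "mydrv" owner string is an assumption and
 * error handling is reduced to the minimum):
 *
 *	ret = pci_enable_device(pdev);
 *	if (ret)
 *		return ret;
 *	ret = pci_request_regions(pdev, "mydrv");
 *	if (ret)
 *		return ret;
 *
 * with a matching pci_release_regions() call in the driver's remove path.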
*/ int pci_request_regions(struct pci_dev *pdev, const char *name) { return pci_request_selected_regions(pdev, ((1 << PCI_STD_NUM_BARS) - 1), name); } EXPORT_SYMBOL(pci_request_regions); /** * pci_request_regions_exclusive - Reserve PCI I/O and memory resources * @pdev: PCI device whose resources are to be reserved * @name: name of the driver requesting the resources * * Returns: 0 on success, negative error code on failure. * * Mark all PCI regions associated with PCI device @pdev as being reserved * by owner @name. Do not access any address inside the PCI regions * unless this call returns successfully. * * pci_request_regions_exclusive() will mark the region so that /dev/mem * and the sysfs MMIO access will not be allowed. * * Returns 0 on success, or %EBUSY on error. A warning message is also * printed on failure. */ int pci_request_regions_exclusive(struct pci_dev *pdev, const char *name) { return pci_request_selected_regions_exclusive(pdev, ((1 << PCI_STD_NUM_BARS) - 1), name); } EXPORT_SYMBOL(pci_request_regions_exclusive); /* * Record the PCI IO range (expressed as CPU physical address + size). * Return a negative value if an error has occurred, zero otherwise */ int pci_register_io_range(const struct fwnode_handle *fwnode, phys_addr_t addr, resource_size_t size) { int ret = 0; #ifdef PCI_IOBASE struct logic_pio_hwaddr *range; if (!size || addr + size < addr) return -EINVAL; range = kzalloc(sizeof(*range), GFP_ATOMIC); if (!range) return -ENOMEM; range->fwnode = fwnode; range->size = size; range->hw_start = addr; range->flags = LOGIC_PIO_CPU_MMIO; ret = logic_pio_register_range(range); if (ret) kfree(range); /* Ignore duplicates due to deferred probing */ if (ret == -EEXIST) ret = 0; #endif return ret; } phys_addr_t pci_pio_to_address(unsigned long pio) { #ifdef PCI_IOBASE if (pio < MMIO_UPPER_LIMIT) return logic_pio_to_hwaddr(pio); #endif return (phys_addr_t) OF_BAD_ADDR; } EXPORT_SYMBOL_GPL(pci_pio_to_address); unsigned long __weak pci_address_to_pio(phys_addr_t address) { #ifdef PCI_IOBASE return logic_pio_trans_cpuaddr(address); #else if (address > IO_SPACE_LIMIT) return (unsigned long)-1; return (unsigned long) address; #endif } /** * pci_remap_iospace - Remap the memory mapped I/O space * @res: Resource describing the I/O space * @phys_addr: physical address of range to be mapped * * Remap the memory mapped I/O space described by the @res and the CPU * physical address @phys_addr into virtual address space. Only * architectures that have memory mapped IO functions defined (and the * PCI_IOBASE value defined) should call this function. */ #ifndef pci_remap_iospace int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr) { #if defined(PCI_IOBASE) unsigned long vaddr = (unsigned long)PCI_IOBASE + res->start; if (!(res->flags & IORESOURCE_IO)) return -EINVAL; if (res->end > IO_SPACE_LIMIT) return -EINVAL; return vmap_page_range(vaddr, vaddr + resource_size(res), phys_addr, pgprot_device(PAGE_KERNEL)); #else /* * This architecture does not have memory mapped I/O space, * so this function should never be called */ WARN_ONCE(1, "This architecture does not support memory mapped I/O\n"); return -ENODEV; #endif } EXPORT_SYMBOL(pci_remap_iospace); #endif /** * pci_unmap_iospace - Unmap the memory mapped I/O space * @res: resource to be unmapped * * Unmap the CPU virtual address @res from virtual address space. Only * architectures that have memory mapped IO functions defined (and the * PCI_IOBASE value defined) should call this function. 
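 *
 * A rough sketch of how a host bridge driver might pair the two calls,
 * assuming @res is an IORESOURCE_IO resource already registered with
 * pci_register_io_range() (not an excerpt from a real driver):
 *
 *	ret = pci_remap_iospace(res, pci_pio_to_address(res->start));
 *	if (ret)
 *		return ret;
 *
 * and, on teardown:
 *
 *	pci_unmap_iospace(res);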
*/ void pci_unmap_iospace(struct resource *res) { #if defined(PCI_IOBASE) unsigned long vaddr = (unsigned long)PCI_IOBASE + res->start; vunmap_range(vaddr, vaddr + resource_size(res)); #endif } EXPORT_SYMBOL(pci_unmap_iospace); static void __pci_set_master(struct pci_dev *dev, bool enable) { u16 old_cmd, cmd; pci_read_config_word(dev, PCI_COMMAND, &old_cmd); if (enable) cmd = old_cmd | PCI_COMMAND_MASTER; else cmd = old_cmd & ~PCI_COMMAND_MASTER; if (cmd != old_cmd) { pci_dbg(dev, "%s bus mastering\n", enable ? "enabling" : "disabling"); pci_write_config_word(dev, PCI_COMMAND, cmd); } dev->is_busmaster = enable; } /** * pcibios_setup - process "pci=" kernel boot arguments * @str: string used to pass in "pci=" kernel boot arguments * * Process kernel boot arguments. This is the default implementation. * Architecture specific implementations can override this as necessary. */ char * __weak __init pcibios_setup(char *str) { return str; } /** * pcibios_set_master - enable PCI bus-mastering for device dev * @dev: the PCI device to enable * * Enables PCI bus-mastering for the device. This is the default * implementation. Architecture specific implementations can override * this if necessary. */ void __weak pcibios_set_master(struct pci_dev *dev) { u8 lat; /* The latency timer doesn't apply to PCIe (either Type 0 or Type 1) */ if (pci_is_pcie(dev)) return; pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat); if (lat < 16) lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency; else if (lat > pcibios_max_latency) lat = pcibios_max_latency; else return; pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); } /** * pci_set_master - enables bus-mastering for device dev * @dev: the PCI device to enable * * Enables bus-mastering on the device and calls pcibios_set_master() * to do the needed arch specific settings. */ void pci_set_master(struct pci_dev *dev) { __pci_set_master(dev, true); pcibios_set_master(dev); } EXPORT_SYMBOL(pci_set_master); /** * pci_clear_master - disables bus-mastering for device dev * @dev: the PCI device to disable */ void pci_clear_master(struct pci_dev *dev) { __pci_set_master(dev, false); } EXPORT_SYMBOL(pci_clear_master); /** * pci_set_cacheline_size - ensure the CACHE_LINE_SIZE register is programmed * @dev: the PCI device for which MWI is to be enabled * * Helper function for pci_set_mwi. * Originally copied from drivers/net/acenic.c. * Copyright 1998-2001 by Jes Sorensen, <jes@trained-monkey.org>. * * RETURNS: An appropriate -ERRNO error value on error, or zero for success. */ int pci_set_cacheline_size(struct pci_dev *dev) { u8 cacheline_size; if (!pci_cache_line_size) return -EINVAL; /* Validate current setting: the PCI_CACHE_LINE_SIZE must be equal to or multiple of the right value. */ pci_read_config_byte(dev, PCI_CACHE_LINE_SIZE, &cacheline_size); if (cacheline_size >= pci_cache_line_size && (cacheline_size % pci_cache_line_size) == 0) return 0; /* Write the correct value. */ pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, pci_cache_line_size); /* Read it back. */ pci_read_config_byte(dev, PCI_CACHE_LINE_SIZE, &cacheline_size); if (cacheline_size == pci_cache_line_size) return 0; pci_dbg(dev, "cache line size of %d is not supported\n", pci_cache_line_size << 2); return -EINVAL; } EXPORT_SYMBOL_GPL(pci_set_cacheline_size); /** * pci_set_mwi - enables memory-write-invalidate PCI transaction * @dev: the PCI device for which MWI is enabled * * Enables the Memory-Write-Invalidate transaction in %PCI_COMMAND. 
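 *
 * A hedged probe-path sketch (MWI is a performance hint, so many drivers
 * simply call pci_try_set_mwi() and ignore the result; the device variable
 * name is illustrative):
 *
 *	pci_set_master(pdev);
 *	pci_try_set_mwi(pdev);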
* * RETURNS: An appropriate -ERRNO error value on error, or zero for success. */ int pci_set_mwi(struct pci_dev *dev) { #ifdef PCI_DISABLE_MWI return 0; #else int rc; u16 cmd; rc = pci_set_cacheline_size(dev); if (rc) return rc; pci_read_config_word(dev, PCI_COMMAND, &cmd); if (!(cmd & PCI_COMMAND_INVALIDATE)) { pci_dbg(dev, "enabling Mem-Wr-Inval\n"); cmd |= PCI_COMMAND_INVALIDATE; pci_write_config_word(dev, PCI_COMMAND, cmd); } return 0; #endif } EXPORT_SYMBOL(pci_set_mwi); /** * pci_try_set_mwi - enables memory-write-invalidate PCI transaction * @dev: the PCI device for which MWI is enabled * * Enables the Memory-Write-Invalidate transaction in %PCI_COMMAND. * Callers are not required to check the return value. * * RETURNS: An appropriate -ERRNO error value on error, or zero for success. */ int pci_try_set_mwi(struct pci_dev *dev) { #ifdef PCI_DISABLE_MWI return 0; #else return pci_set_mwi(dev); #endif } EXPORT_SYMBOL(pci_try_set_mwi); /** * pci_clear_mwi - disables Memory-Write-Invalidate for device dev * @dev: the PCI device to disable * * Disables PCI Memory-Write-Invalidate transaction on the device */ void pci_clear_mwi(struct pci_dev *dev) { #ifndef PCI_DISABLE_MWI u16 cmd; pci_read_config_word(dev, PCI_COMMAND, &cmd); if (cmd & PCI_COMMAND_INVALIDATE) { cmd &= ~PCI_COMMAND_INVALIDATE; pci_write_config_word(dev, PCI_COMMAND, cmd); } #endif } EXPORT_SYMBOL(pci_clear_mwi); /** * pci_disable_parity - disable parity checking for device * @dev: the PCI device to operate on * * Disable parity checking for device @dev */ void pci_disable_parity(struct pci_dev *dev) { u16 cmd; pci_read_config_word(dev, PCI_COMMAND, &cmd); if (cmd & PCI_COMMAND_PARITY) { cmd &= ~PCI_COMMAND_PARITY; pci_write_config_word(dev, PCI_COMMAND, cmd); } } /** * pci_intx - enables/disables PCI INTx for device dev * @pdev: the PCI device to operate on * @enable: boolean: whether to enable or disable PCI INTx * * Enables/disables PCI INTx for device @pdev */ void pci_intx(struct pci_dev *pdev, int enable) { u16 pci_command, new; pci_read_config_word(pdev, PCI_COMMAND, &pci_command); if (enable) new = pci_command & ~PCI_COMMAND_INTX_DISABLE; else new = pci_command | PCI_COMMAND_INTX_DISABLE; if (new == pci_command) return; pci_write_config_word(pdev, PCI_COMMAND, new); } EXPORT_SYMBOL_GPL(pci_intx); /** * pci_wait_for_pending_transaction - wait for pending transaction * @dev: the PCI device to operate on * * Return 0 if transaction is pending 1 otherwise. */ int pci_wait_for_pending_transaction(struct pci_dev *dev) { if (!pci_is_pcie(dev)) return 1; return pci_wait_for_pending(dev, pci_pcie_cap(dev) + PCI_EXP_DEVSTA, PCI_EXP_DEVSTA_TRPND); } EXPORT_SYMBOL(pci_wait_for_pending_transaction); /** * pcie_flr - initiate a PCIe function level reset * @dev: device to reset * * Initiate a function level reset unconditionally on @dev without * checking any flags and DEVCAP */ int pcie_flr(struct pci_dev *dev) { if (!pci_wait_for_pending_transaction(dev)) pci_err(dev, "timed out waiting for pending transaction; performing function level reset anyway\n"); pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_BCR_FLR); if (dev->imm_ready) return 0; /* * Per PCIe r4.0, sec 6.6.2, a device must complete an FLR within * 100ms, but may silently discard requests while the FLR is in * progress. Wait 100ms before trying to access the device. 
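 *
 * After the 100ms sleep below, pci_dev_wait() keeps polling the device
 * until it responds to configuration reads again, giving up after
 * PCIE_RESET_READY_POLL_MS in total.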
*/ msleep(100); return pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS); } EXPORT_SYMBOL_GPL(pcie_flr); /** * pcie_reset_flr - initiate a PCIe function level reset * @dev: device to reset * @probe: if true, return 0 if device can be reset this way * * Initiate a function level reset on @dev. */ int pcie_reset_flr(struct pci_dev *dev, bool probe) { if (dev->dev_flags & PCI_DEV_FLAGS_NO_FLR_RESET) return -ENOTTY; if (!(dev->devcap & PCI_EXP_DEVCAP_FLR)) return -ENOTTY; if (probe) return 0; return pcie_flr(dev); } EXPORT_SYMBOL_GPL(pcie_reset_flr); static int pci_af_flr(struct pci_dev *dev, bool probe) { int pos; u8 cap; pos = pci_find_capability(dev, PCI_CAP_ID_AF); if (!pos) return -ENOTTY; if (dev->dev_flags & PCI_DEV_FLAGS_NO_FLR_RESET) return -ENOTTY; pci_read_config_byte(dev, pos + PCI_AF_CAP, &cap); if (!(cap & PCI_AF_CAP_TP) || !(cap & PCI_AF_CAP_FLR)) return -ENOTTY; if (probe) return 0; /* * Wait for Transaction Pending bit to clear. A word-aligned test * is used, so we use the control offset rather than status and shift * the test bit to match. */ if (!pci_wait_for_pending(dev, pos + PCI_AF_CTRL, PCI_AF_STATUS_TP << 8)) pci_err(dev, "timed out waiting for pending transaction; performing AF function level reset anyway\n"); pci_write_config_byte(dev, pos + PCI_AF_CTRL, PCI_AF_CTRL_FLR); if (dev->imm_ready) return 0; /* * Per Advanced Capabilities for Conventional PCI ECN, 13 April 2006, * updated 27 July 2006; a device must complete an FLR within * 100ms, but may silently discard requests while the FLR is in * progress. Wait 100ms before trying to access the device. */ msleep(100); return pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS); } /** * pci_pm_reset - Put device into PCI_D3 and back into PCI_D0. * @dev: Device to reset. * @probe: if true, return 0 if the device can be reset this way. * * If @dev supports native PCI PM and its PCI_PM_CTRL_NO_SOFT_RESET flag is * unset, it will be reinitialized internally when going from PCI_D3hot to * PCI_D0. If that's the case and the device is not in a low-power state * already, force it into PCI_D3hot and back to PCI_D0, causing it to be reset. * * NOTE: This causes the caller to sleep for twice the device power transition * cooldown period, which for the D0->D3hot and D3hot->D0 transitions is 10 ms * by default (i.e. unless the @dev's d3hot_delay field has a different value). * Moreover, only devices in D0 can be reset by this function. */ static int pci_pm_reset(struct pci_dev *dev, bool probe) { u16 csr; if (!dev->pm_cap || dev->dev_flags & PCI_DEV_FLAGS_NO_PM_RESET) return -ENOTTY; pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &csr); if (csr & PCI_PM_CTRL_NO_SOFT_RESET) return -ENOTTY; if (probe) return 0; if (dev->current_state != PCI_D0) return -EINVAL; csr &= ~PCI_PM_CTRL_STATE_MASK; csr |= PCI_D3hot; pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr); pci_dev_d3_sleep(dev); csr &= ~PCI_PM_CTRL_STATE_MASK; csr |= PCI_D0; pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr); pci_dev_d3_sleep(dev); return pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS); } /** * pcie_wait_for_link_status - Wait for link status change * @pdev: Device whose link to wait for. * @use_lt: Use the LT bit if TRUE, or the DLLLA bit if FALSE. * @active: Waiting for active or inactive? * * Return 0 if successful, or -ETIMEDOUT if status has not changed within * PCIE_LINK_RETRAIN_TIMEOUT_MS milliseconds. 
*/ static int pcie_wait_for_link_status(struct pci_dev *pdev, bool use_lt, bool active) { u16 lnksta_mask, lnksta_match; unsigned long end_jiffies; u16 lnksta; lnksta_mask = use_lt ? PCI_EXP_LNKSTA_LT : PCI_EXP_LNKSTA_DLLLA; lnksta_match = active ? lnksta_mask : 0; end_jiffies = jiffies + msecs_to_jiffies(PCIE_LINK_RETRAIN_TIMEOUT_MS); do { pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnksta); if ((lnksta & lnksta_mask) == lnksta_match) return 0; msleep(1); } while (time_before(jiffies, end_jiffies)); return -ETIMEDOUT; } /** * pcie_retrain_link - Request a link retrain and wait for it to complete * @pdev: Device whose link to retrain. * @use_lt: Use the LT bit if TRUE, or the DLLLA bit if FALSE, for status. * * Trigger retraining of the PCIe Link and wait for the completion of the * retraining. As link retraining is known to asserts LBMS and may change * the Link Speed, LBMS is cleared after the retraining and the Link Speed * of the subordinate bus is updated. * * Retrain completion status is retrieved from the Link Status Register * according to @use_lt. It is not verified whether the use of the DLLLA * bit is valid. * * Return 0 if successful, or -ETIMEDOUT if training has not completed * within PCIE_LINK_RETRAIN_TIMEOUT_MS milliseconds. */ int pcie_retrain_link(struct pci_dev *pdev, bool use_lt) { int rc; /* * Ensure the updated LNKCTL parameters are used during link * training by checking that there is no ongoing link training that * may have started before link parameters were changed, so as to * avoid LTSSM race as recommended in Implementation Note at the end * of PCIe r6.1 sec 7.5.3.7. */ rc = pcie_wait_for_link_status(pdev, true, false); if (rc) return rc; pcie_capability_set_word(pdev, PCI_EXP_LNKCTL, PCI_EXP_LNKCTL_RL); if (pdev->clear_retrain_link) { /* * Due to an erratum in some devices the Retrain Link bit * needs to be cleared again manually to allow the link * training to succeed. */ pcie_capability_clear_word(pdev, PCI_EXP_LNKCTL, PCI_EXP_LNKCTL_RL); } rc = pcie_wait_for_link_status(pdev, use_lt, !use_lt); /* * Clear LBMS after a manual retrain so that the bit can be used * to track link speed or width changes made by hardware itself * in attempt to correct unreliable link operation. */ pcie_reset_lbms(pdev); /* * Ensure the Link Speed updates after retraining in case the Link * Speed was changed because of the retraining. While the bwctrl's * IRQ handler normally picks up the new Link Speed, clearing LBMS * races with the IRQ handler reading the Link Status register and * can result in the handler returning early without updating the * Link Speed. */ if (pdev->subordinate) pcie_update_link_speed(pdev->subordinate); return rc; } /** * pcie_wait_for_link_delay - Wait until link is active or inactive * @pdev: Bridge device * @active: waiting for active or inactive? * @delay: Delay to wait after link has become active (in ms) * * Use this to wait till link becomes active or inactive. */ static bool pcie_wait_for_link_delay(struct pci_dev *pdev, bool active, int delay) { int rc; /* * Some controllers might not implement link active reporting. In this * case, we wait for 1000 ms + any delay requested by the caller. */ if (!pdev->link_active_reporting) { msleep(PCIE_LINK_RETRAIN_TIMEOUT_MS + delay); return true; } /* * PCIe r4.0 sec 6.6.1, a component must enter LTSSM Detect within 20ms, * after which we should expect the link to be active if the reset was * successful. 
If so, software must wait a minimum 100ms before sending * configuration requests to devices downstream this port. * * If the link fails to activate, either the device was physically * removed or the link is permanently failed. */ if (active) msleep(20); rc = pcie_wait_for_link_status(pdev, false, active); if (active) { if (rc) rc = pcie_failed_link_retrain(pdev); if (rc) return false; msleep(delay); return true; } if (rc) return false; return true; } /** * pcie_wait_for_link - Wait until link is active or inactive * @pdev: Bridge device * @active: waiting for active or inactive? * * Use this to wait till link becomes active or inactive. */ bool pcie_wait_for_link(struct pci_dev *pdev, bool active) { return pcie_wait_for_link_delay(pdev, active, 100); } /* * Find maximum D3cold delay required by all the devices on the bus. The * spec says 100 ms, but firmware can lower it and we allow drivers to * increase it as well. * * Called with @pci_bus_sem locked for reading. */ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus) { const struct pci_dev *pdev; int min_delay = 100; int max_delay = 0; list_for_each_entry(pdev, &bus->devices, bus_list) { if (pdev->d3cold_delay < min_delay) min_delay = pdev->d3cold_delay; if (pdev->d3cold_delay > max_delay) max_delay = pdev->d3cold_delay; } return max(min_delay, max_delay); } /** * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible * @dev: PCI bridge * @reset_type: reset type in human-readable form * * Handle necessary delays before access to the devices on the secondary * side of the bridge are permitted after D3cold to D0 transition * or Conventional Reset. * * For PCIe this means the delays in PCIe 5.0 section 6.6.1. For * conventional PCI it means Tpvrh + Trhfa specified in PCI 3.0 section * 4.3.2. * * Return 0 on success or -ENOTTY if the first device on the secondary bus * failed to become accessible. */ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type) { struct pci_dev *child __free(pci_dev_put) = NULL; int delay; if (pci_dev_is_disconnected(dev)) return 0; if (!pci_is_bridge(dev)) return 0; down_read(&pci_bus_sem); /* * We only deal with devices that are present currently on the bus. * For any hot-added devices the access delay is handled in pciehp * board_added(). In case of ACPI hotplug the firmware is expected * to configure the devices before OS is notified. */ if (!dev->subordinate || list_empty(&dev->subordinate->devices)) { up_read(&pci_bus_sem); return 0; } /* Take d3cold_delay requirements into account */ delay = pci_bus_max_d3cold_delay(dev->subordinate); if (!delay) { up_read(&pci_bus_sem); return 0; } child = pci_dev_get(list_first_entry(&dev->subordinate->devices, struct pci_dev, bus_list)); up_read(&pci_bus_sem); /* * Conventional PCI and PCI-X we need to wait Tpvrh + Trhfa before * accessing the device after reset (that is 1000 ms + 100 ms). */ if (!pci_is_pcie(dev)) { pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay); msleep(1000 + delay); return 0; } /* * For PCIe downstream and root ports that do not support speeds * greater than 5 GT/s need to wait minimum 100 ms. For higher * speeds (gen3) we need to wait first for the data link layer to * become active. * * However, 100 ms is the minimum and the PCIe spec says the * software must allow at least 1s before it can determine that the * device that did not respond is a broken device. Also device can * take longer than that to respond if it indicates so through Request * Retry Status completions. 
* * Therefore we wait for 100 ms and check for the device presence * until the timeout expires. */ if (!pcie_downstream_port(dev)) return 0; if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) { u16 status; pci_dbg(dev, "waiting %d ms for downstream link\n", delay); msleep(delay); if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay)) return 0; /* * If the port supports active link reporting we now check * whether the link is active and if not bail out early with * the assumption that the device is not present anymore. */ if (!dev->link_active_reporting) return -ENOTTY; pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &status); if (!(status & PCI_EXP_LNKSTA_DLLLA)) return -ENOTTY; return pci_dev_wait(child, reset_type, PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT); } pci_dbg(dev, "waiting %d ms for downstream link, after activation\n", delay); if (!pcie_wait_for_link_delay(dev, true, delay)) { /* Did not train, no need to wait any further */ pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay); return -ENOTTY; } return pci_dev_wait(child, reset_type, PCIE_RESET_READY_POLL_MS - delay); } void pci_reset_secondary_bus(struct pci_dev *dev) { u16 ctrl; pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl); ctrl |= PCI_BRIDGE_CTL_BUS_RESET; pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl); /* * PCI spec v3.0 7.6.4.2 requires minimum Trst of 1ms. Double * this to 2ms to ensure that we meet the minimum requirement. */ msleep(2); ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET; pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl); } void __weak pcibios_reset_secondary_bus(struct pci_dev *dev) { pci_reset_secondary_bus(dev); } /** * pci_bridge_secondary_bus_reset - Reset the secondary bus on a PCI bridge. * @dev: Bridge device * * Use the bridge control register to assert reset on the secondary bus. * Devices on the secondary bus are left in power-on state. 
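 *
 * A hedged sketch of a caller (locking shown with pci_dev_lock() so the
 * "unlocked secondary bus reset" warning below does not trigger; real
 * error-recovery paths such as pci_bus_error_reset() do their own
 * sequencing):
 *
 *	pci_dev_lock(bridge);
 *	ret = pci_bridge_secondary_bus_reset(bridge);
 *	pci_dev_unlock(bridge);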
*/ int pci_bridge_secondary_bus_reset(struct pci_dev *dev) { if (!dev->block_cfg_access) pci_warn_once(dev, "unlocked secondary bus reset via: %pS\n", __builtin_return_address(0)); pcibios_reset_secondary_bus(dev); return pci_bridge_wait_for_secondary_bus(dev, "bus reset"); } EXPORT_SYMBOL_GPL(pci_bridge_secondary_bus_reset); static int pci_parent_bus_reset(struct pci_dev *dev, bool probe) { struct pci_dev *pdev; if (pci_is_root_bus(dev->bus) || dev->subordinate || !dev->bus->self || dev->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET) return -ENOTTY; list_for_each_entry(pdev, &dev->bus->devices, bus_list) if (pdev != dev) return -ENOTTY; if (probe) return 0; return pci_bridge_secondary_bus_reset(dev->bus->self); } static int pci_reset_hotplug_slot(struct hotplug_slot *hotplug, bool probe) { int rc = -ENOTTY; if (!hotplug || !try_module_get(hotplug->owner)) return rc; if (hotplug->ops->reset_slot) rc = hotplug->ops->reset_slot(hotplug, probe); module_put(hotplug->owner); return rc; } static int pci_dev_reset_slot_function(struct pci_dev *dev, bool probe) { if (dev->multifunction || dev->subordinate || !dev->slot || dev->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET) return -ENOTTY; return pci_reset_hotplug_slot(dev->slot->hotplug, probe); } static u16 cxl_port_dvsec(struct pci_dev *dev) { return pci_find_dvsec_capability(dev, PCI_VENDOR_ID_CXL, PCI_DVSEC_CXL_PORT); } static bool cxl_sbr_masked(struct pci_dev *dev) { u16 dvsec, reg; int rc; dvsec = cxl_port_dvsec(dev); if (!dvsec) return false; rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_PORT_CTL, ®); if (rc || PCI_POSSIBLE_ERROR(reg)) return false; /* * Per CXL spec r3.1, sec 8.1.5.2, when "Unmask SBR" is 0, the SBR * bit in Bridge Control has no effect. When 1, the Port generates * hot reset when the SBR bit is set to 1. */ if (reg & PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR) return false; return true; } static int pci_reset_bus_function(struct pci_dev *dev, bool probe) { struct pci_dev *bridge = pci_upstream_bridge(dev); int rc; /* * If "dev" is below a CXL port that has SBR control masked, SBR * won't do anything, so return error. */ if (bridge && cxl_sbr_masked(bridge)) { if (probe) return 0; return -ENOTTY; } rc = pci_dev_reset_slot_function(dev, probe); if (rc != -ENOTTY) return rc; return pci_parent_bus_reset(dev, probe); } static int cxl_reset_bus_function(struct pci_dev *dev, bool probe) { struct pci_dev *bridge; u16 dvsec, reg, val; int rc; bridge = pci_upstream_bridge(dev); if (!bridge) return -ENOTTY; dvsec = cxl_port_dvsec(bridge); if (!dvsec) return -ENOTTY; if (probe) return 0; rc = pci_read_config_word(bridge, dvsec + PCI_DVSEC_CXL_PORT_CTL, ®); if (rc) return -ENOTTY; if (reg & PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR) { val = reg; } else { val = reg | PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR; pci_write_config_word(bridge, dvsec + PCI_DVSEC_CXL_PORT_CTL, val); } rc = pci_reset_bus_function(dev, probe); if (reg != val) pci_write_config_word(bridge, dvsec + PCI_DVSEC_CXL_PORT_CTL, reg); return rc; } void pci_dev_lock(struct pci_dev *dev) { /* block PM suspend, driver probe, etc. 
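 *
 * The pairing below is device_lock() first, then the config space access
 * lock; pci_dev_unlock() releases them in the opposite order.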
*/ device_lock(&dev->dev); pci_cfg_access_lock(dev); } EXPORT_SYMBOL_GPL(pci_dev_lock); /* Return 1 on successful lock, 0 on contention */ int pci_dev_trylock(struct pci_dev *dev) { if (device_trylock(&dev->dev)) { if (pci_cfg_access_trylock(dev)) return 1; device_unlock(&dev->dev); } return 0; } EXPORT_SYMBOL_GPL(pci_dev_trylock); void pci_dev_unlock(struct pci_dev *dev) { pci_cfg_access_unlock(dev); device_unlock(&dev->dev); } EXPORT_SYMBOL_GPL(pci_dev_unlock); static void pci_dev_save_and_disable(struct pci_dev *dev) { const struct pci_error_handlers *err_handler = dev->driver ? dev->driver->err_handler : NULL; /* * dev->driver->err_handler->reset_prepare() is protected against * races with ->remove() by the device lock, which must be held by * the caller. */ if (err_handler && err_handler->reset_prepare) err_handler->reset_prepare(dev); else if (dev->driver) pci_warn(dev, "resetting"); /* * Wake-up device prior to save. PM registers default to D0 after * reset and a simple register restore doesn't reliably return * to a non-D0 state anyway. */ pci_set_power_state(dev, PCI_D0); pci_save_state(dev); /* * Disable the device by clearing the Command register, except for * INTx-disable which is set. This not only disables MMIO and I/O port * BARs, but also prevents the device from being Bus Master, preventing * DMA from the device including MSI/MSI-X interrupts. For PCI 2.3 * compliant devices, INTx-disable prevents legacy interrupts. */ pci_write_config_word(dev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); } static void pci_dev_restore(struct pci_dev *dev) { const struct pci_error_handlers *err_handler = dev->driver ? dev->driver->err_handler : NULL; pci_restore_state(dev); /* * dev->driver->err_handler->reset_done() is protected against * races with ->remove() by the device lock, which must be held by * the caller. */ if (err_handler && err_handler->reset_done) err_handler->reset_done(dev); else if (dev->driver) pci_warn(dev, "reset done"); } /* dev->reset_methods[] is a 0-terminated list of indices into this array */ const struct pci_reset_fn_method pci_reset_fn_methods[] = { { }, { pci_dev_specific_reset, .name = "device_specific" }, { pci_dev_acpi_reset, .name = "acpi" }, { pcie_reset_flr, .name = "flr" }, { pci_af_flr, .name = "af_flr" }, { pci_pm_reset, .name = "pm" }, { pci_reset_bus_function, .name = "bus" }, { cxl_reset_bus_function, .name = "cxl_bus" }, }; /** * __pci_reset_function_locked - reset a PCI device function while holding * the @dev mutex lock. * @dev: PCI device to reset * * Some devices allow an individual function to be reset without affecting * other functions in the same device. The PCI device must be responsive * to PCI config space in order to use this function. * * The device function is presumed to be unused and the caller is holding * the device mutex lock when this function is called. * * Resetting the device will make the contents of PCI configuration space * random, so any caller of this must be prepared to reinitialise the * device including MSI, bus mastering, BARs, decoding IO and memory spaces, * etc. * * Returns 0 if the device function was successfully reset or negative if the * device doesn't support resetting a single function. */ int __pci_reset_function_locked(struct pci_dev *dev) { int i, m, rc; const struct pci_reset_fn_method *method; might_sleep(); /* * A reset method returns -ENOTTY if it doesn't support this device and * we should try the next method. * * If it returns 0 (success), we're finished. 
If it returns any other * error, we're also finished: this indicates that further reset * mechanisms might be broken on the device. */ for (i = 0; i < PCI_NUM_RESET_METHODS; i++) { m = dev->reset_methods[i]; if (!m) return -ENOTTY; method = &pci_reset_fn_methods[m]; pci_dbg(dev, "reset via %s\n", method->name); rc = method->reset_fn(dev, PCI_RESET_DO_RESET); if (!rc) return 0; pci_dbg(dev, "%s failed with %d\n", method->name, rc); if (rc != -ENOTTY) return rc; } return -ENOTTY; } EXPORT_SYMBOL_GPL(__pci_reset_function_locked); /** * pci_init_reset_methods - check whether device can be safely reset * and store supported reset mechanisms. * @dev: PCI device to check for reset mechanisms * * Some devices allow an individual function to be reset without affecting * other functions in the same device. The PCI device must be in D0-D3hot * state. * * Stores reset mechanisms supported by device in reset_methods byte array * which is a member of struct pci_dev. */ void pci_init_reset_methods(struct pci_dev *dev) { int m, i, rc; BUILD_BUG_ON(ARRAY_SIZE(pci_reset_fn_methods) != PCI_NUM_RESET_METHODS); might_sleep(); i = 0; for (m = 1; m < PCI_NUM_RESET_METHODS; m++) { rc = pci_reset_fn_methods[m].reset_fn(dev, PCI_RESET_PROBE); if (!rc) dev->reset_methods[i++] = m; else if (rc != -ENOTTY) break; } dev->reset_methods[i] = 0; } /** * pci_reset_function - quiesce and reset a PCI device function * @dev: PCI device to reset * * Some devices allow an individual function to be reset without affecting * other functions in the same device. The PCI device must be responsive * to PCI config space in order to use this function. * * This function does not just reset the PCI portion of a device, but * clears all the state associated with the device. This function differs * from __pci_reset_function_locked() in that it saves and restores device state * over the reset and takes the PCI device lock. * * Returns 0 if the device function was successfully reset or negative if the * device doesn't support resetting a single function. */ int pci_reset_function(struct pci_dev *dev) { struct pci_dev *bridge; int rc; if (!pci_reset_supported(dev)) return -ENOTTY; /* * If there's no upstream bridge, no locking is needed since there is * no upstream bridge configuration to hold consistent. */ bridge = pci_upstream_bridge(dev); if (bridge) pci_dev_lock(bridge); pci_dev_lock(dev); pci_dev_save_and_disable(dev); rc = __pci_reset_function_locked(dev); pci_dev_restore(dev); pci_dev_unlock(dev); if (bridge) pci_dev_unlock(bridge); return rc; } EXPORT_SYMBOL_GPL(pci_reset_function); /** * pci_reset_function_locked - quiesce and reset a PCI device function * @dev: PCI device to reset * * Some devices allow an individual function to be reset without affecting * other functions in the same device. The PCI device must be responsive * to PCI config space in order to use this function. * * This function does not just reset the PCI portion of a device, but * clears all the state associated with the device. This function differs * from __pci_reset_function_locked() in that it saves and restores device state * over the reset. It also differs from pci_reset_function() in that it * requires the PCI device lock to be held. * * Returns 0 if the device function was successfully reset or negative if the * device doesn't support resetting a single function. 
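 *
 * A hedged sketch, assuming the caller already holds the PCI device lock,
 * for example taken with pci_dev_lock():
 *
 *	pci_dev_lock(dev);
 *	rc = pci_reset_function_locked(dev);
 *	pci_dev_unlock(dev);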
*/ int pci_reset_function_locked(struct pci_dev *dev) { int rc; if (!pci_reset_supported(dev)) return -ENOTTY; pci_dev_save_and_disable(dev); rc = __pci_reset_function_locked(dev); pci_dev_restore(dev); return rc; } EXPORT_SYMBOL_GPL(pci_reset_function_locked); /** * pci_try_reset_function - quiesce and reset a PCI device function * @dev: PCI device to reset * * Same as above, except return -EAGAIN if unable to lock device. */ int pci_try_reset_function(struct pci_dev *dev) { int rc; if (!pci_reset_supported(dev)) return -ENOTTY; if (!pci_dev_trylock(dev)) return -EAGAIN; pci_dev_save_and_disable(dev); rc = __pci_reset_function_locked(dev); pci_dev_restore(dev); pci_dev_unlock(dev); return rc; } EXPORT_SYMBOL_GPL(pci_try_reset_function); /* Do any devices on or below this bus prevent a bus reset? */ static bool pci_bus_resettable(struct pci_bus *bus) { struct pci_dev *dev; if (bus->self && (bus->self->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET)) return false; list_for_each_entry(dev, &bus->devices, bus_list) { if (dev->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET || (dev->subordinate && !pci_bus_resettable(dev->subordinate))) return false; } return true; } /* Lock devices from the top of the tree down */ static void pci_bus_lock(struct pci_bus *bus) { struct pci_dev *dev; pci_dev_lock(bus->self); list_for_each_entry(dev, &bus->devices, bus_list) { if (dev->subordinate) pci_bus_lock(dev->subordinate); else pci_dev_lock(dev); } } /* Unlock devices from the bottom of the tree up */ static void pci_bus_unlock(struct pci_bus *bus) { struct pci_dev *dev; list_for_each_entry(dev, &bus->devices, bus_list) { if (dev->subordinate) pci_bus_unlock(dev->subordinate); else pci_dev_unlock(dev); } pci_dev_unlock(bus->self); } /* Return 1 on successful lock, 0 on contention */ static int pci_bus_trylock(struct pci_bus *bus) { struct pci_dev *dev; if (!pci_dev_trylock(bus->self)) return 0; list_for_each_entry(dev, &bus->devices, bus_list) { if (dev->subordinate) { if (!pci_bus_trylock(dev->subordinate)) goto unlock; } else if (!pci_dev_trylock(dev)) goto unlock; } return 1; unlock: list_for_each_entry_continue_reverse(dev, &bus->devices, bus_list) { if (dev->subordinate) pci_bus_unlock(dev->subordinate); else pci_dev_unlock(dev); } pci_dev_unlock(bus->self); return 0; } /* Do any devices on or below this slot prevent a bus reset? 
*/ static bool pci_slot_resettable(struct pci_slot *slot) { struct pci_dev *dev; if (slot->bus->self && (slot->bus->self->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET)) return false; list_for_each_entry(dev, &slot->bus->devices, bus_list) { if (!dev->slot || dev->slot != slot) continue; if (dev->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET || (dev->subordinate && !pci_bus_resettable(dev->subordinate))) return false; } return true; } /* Lock devices from the top of the tree down */ static void pci_slot_lock(struct pci_slot *slot) { struct pci_dev *dev; list_for_each_entry(dev, &slot->bus->devices, bus_list) { if (!dev->slot || dev->slot != slot) continue; if (dev->subordinate) pci_bus_lock(dev->subordinate); else pci_dev_lock(dev); } } /* Unlock devices from the bottom of the tree up */ static void pci_slot_unlock(struct pci_slot *slot) { struct pci_dev *dev; list_for_each_entry(dev, &slot->bus->devices, bus_list) { if (!dev->slot || dev->slot != slot) continue; if (dev->subordinate) pci_bus_unlock(dev->subordinate); else pci_dev_unlock(dev); } } /* Return 1 on successful lock, 0 on contention */ static int pci_slot_trylock(struct pci_slot *slot) { struct pci_dev *dev; list_for_each_entry(dev, &slot->bus->devices, bus_list) { if (!dev->slot || dev->slot != slot) continue; if (dev->subordinate) { if (!pci_bus_trylock(dev->subordinate)) { pci_dev_unlock(dev); goto unlock; } } else if (!pci_dev_trylock(dev)) goto unlock; } return 1; unlock: list_for_each_entry_continue_reverse(dev, &slot->bus->devices, bus_list) { if (!dev->slot || dev->slot != slot) continue; if (dev->subordinate) pci_bus_unlock(dev->subordinate); else pci_dev_unlock(dev); } return 0; } /* * Save and disable devices from the top of the tree down while holding * the @dev mutex lock for the entire tree. */ static void pci_bus_save_and_disable_locked(struct pci_bus *bus) { struct pci_dev *dev; list_for_each_entry(dev, &bus->devices, bus_list) { pci_dev_save_and_disable(dev); if (dev->subordinate) pci_bus_save_and_disable_locked(dev->subordinate); } } /* * Restore devices from top of the tree down while holding @dev mutex lock * for the entire tree. Parent bridges need to be restored before we can * get to subordinate devices. */ static void pci_bus_restore_locked(struct pci_bus *bus) { struct pci_dev *dev; list_for_each_entry(dev, &bus->devices, bus_list) { pci_dev_restore(dev); if (dev->subordinate) { pci_bridge_wait_for_secondary_bus(dev, "bus reset"); pci_bus_restore_locked(dev->subordinate); } } } /* * Save and disable devices from the top of the tree down while holding * the @dev mutex lock for the entire tree. */ static void pci_slot_save_and_disable_locked(struct pci_slot *slot) { struct pci_dev *dev; list_for_each_entry(dev, &slot->bus->devices, bus_list) { if (!dev->slot || dev->slot != slot) continue; pci_dev_save_and_disable(dev); if (dev->subordinate) pci_bus_save_and_disable_locked(dev->subordinate); } } /* * Restore devices from top of the tree down while holding @dev mutex lock * for the entire tree. Parent bridges need to be restored before we can * get to subordinate devices. 
*/ static void pci_slot_restore_locked(struct pci_slot *slot) { struct pci_dev *dev; list_for_each_entry(dev, &slot->bus->devices, bus_list) { if (!dev->slot || dev->slot != slot) continue; pci_dev_restore(dev); if (dev->subordinate) { pci_bridge_wait_for_secondary_bus(dev, "slot reset"); pci_bus_restore_locked(dev->subordinate); } } } static int pci_slot_reset(struct pci_slot *slot, bool probe) { int rc; if (!slot || !pci_slot_resettable(slot)) return -ENOTTY; if (!probe) pci_slot_lock(slot); might_sleep(); rc = pci_reset_hotplug_slot(slot->hotplug, probe); if (!probe) pci_slot_unlock(slot); return rc; } /** * pci_probe_reset_slot - probe whether a PCI slot can be reset * @slot: PCI slot to probe * * Return 0 if slot can be reset, negative if a slot reset is not supported. */ int pci_probe_reset_slot(struct pci_slot *slot) { return pci_slot_reset(slot, PCI_RESET_PROBE); } EXPORT_SYMBOL_GPL(pci_probe_reset_slot); /** * __pci_reset_slot - Try to reset a PCI slot * @slot: PCI slot to reset * * A PCI bus may host multiple slots, each slot may support a reset mechanism * independent of other slots. For instance, some slots may support slot power * control. In the case of a 1:1 bus to slot architecture, this function may * wrap the bus reset to avoid spurious slot related events such as hotplug. * Generally a slot reset should be attempted before a bus reset. All of the * function of the slot and any subordinate buses behind the slot are reset * through this function. PCI config space of all devices in the slot and * behind the slot is saved before and restored after reset. * * Same as above except return -EAGAIN if the slot cannot be locked */ static int __pci_reset_slot(struct pci_slot *slot) { int rc; rc = pci_slot_reset(slot, PCI_RESET_PROBE); if (rc) return rc; if (pci_slot_trylock(slot)) { pci_slot_save_and_disable_locked(slot); might_sleep(); rc = pci_reset_hotplug_slot(slot->hotplug, PCI_RESET_DO_RESET); pci_slot_restore_locked(slot); pci_slot_unlock(slot); } else rc = -EAGAIN; return rc; } static int pci_bus_reset(struct pci_bus *bus, bool probe) { int ret; if (!bus->self || !pci_bus_resettable(bus)) return -ENOTTY; if (probe) return 0; pci_bus_lock(bus); might_sleep(); ret = pci_bridge_secondary_bus_reset(bus->self); pci_bus_unlock(bus); return ret; } /** * pci_bus_error_reset - reset the bridge's subordinate bus * @bridge: The parent device that connects to the bus to reset * * This function will first try to reset the slots on this bus if the method is * available. If slot reset fails or is not available, this will fall back to a * secondary bus reset. */ int pci_bus_error_reset(struct pci_dev *bridge) { struct pci_bus *bus = bridge->subordinate; struct pci_slot *slot; if (!bus) return -ENOTTY; mutex_lock(&pci_slot_mutex); if (list_empty(&bus->slots)) goto bus_reset; list_for_each_entry(slot, &bus->slots, list) if (pci_probe_reset_slot(slot)) goto bus_reset; list_for_each_entry(slot, &bus->slots, list) if (pci_slot_reset(slot, PCI_RESET_DO_RESET)) goto bus_reset; mutex_unlock(&pci_slot_mutex); return 0; bus_reset: mutex_unlock(&pci_slot_mutex); return pci_bus_reset(bridge->subordinate, PCI_RESET_DO_RESET); } /** * pci_probe_reset_bus - probe whether a PCI bus can be reset * @bus: PCI bus to probe * * Return 0 if bus can be reset, negative if a bus reset is not supported. 
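 *
 * A hedged sketch of the probe-then-reset pattern (similar in spirit to
 * what device assignment code does; error handling omitted):
 *
 *	if (!pci_probe_reset_bus(pdev->bus))
 *		rc = pci_reset_bus(pdev);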
*/ int pci_probe_reset_bus(struct pci_bus *bus) { return pci_bus_reset(bus, PCI_RESET_PROBE); } EXPORT_SYMBOL_GPL(pci_probe_reset_bus); /** * __pci_reset_bus - Try to reset a PCI bus * @bus: top level PCI bus to reset * * Same as above except return -EAGAIN if the bus cannot be locked */ int __pci_reset_bus(struct pci_bus *bus) { int rc; rc = pci_bus_reset(bus, PCI_RESET_PROBE); if (rc) return rc; if (pci_bus_trylock(bus)) { pci_bus_save_and_disable_locked(bus); might_sleep(); rc = pci_bridge_secondary_bus_reset(bus->self); pci_bus_restore_locked(bus); pci_bus_unlock(bus); } else rc = -EAGAIN; return rc; } /** * pci_reset_bus - Try to reset a PCI bus * @pdev: top level PCI device to reset via slot/bus * * Same as above except return -EAGAIN if the bus cannot be locked */ int pci_reset_bus(struct pci_dev *pdev) { return (!pci_probe_reset_slot(pdev->slot)) ? __pci_reset_slot(pdev->slot) : __pci_reset_bus(pdev->bus); } EXPORT_SYMBOL_GPL(pci_reset_bus); /** * pcix_get_max_mmrbc - get PCI-X maximum designed memory read byte count * @dev: PCI device to query * * Returns mmrbc: maximum designed memory read count in bytes or * appropriate error value. */ int pcix_get_max_mmrbc(struct pci_dev *dev) { int cap; u32 stat; cap = pci_find_capability(dev, PCI_CAP_ID_PCIX); if (!cap) return -EINVAL; if (pci_read_config_dword(dev, cap + PCI_X_STATUS, &stat)) return -EINVAL; return 512 << FIELD_GET(PCI_X_STATUS_MAX_READ, stat); } EXPORT_SYMBOL(pcix_get_max_mmrbc); /** * pcix_get_mmrbc - get PCI-X maximum memory read byte count * @dev: PCI device to query * * Returns mmrbc: maximum memory read count in bytes or appropriate error * value. */ int pcix_get_mmrbc(struct pci_dev *dev) { int cap; u16 cmd; cap = pci_find_capability(dev, PCI_CAP_ID_PCIX); if (!cap) return -EINVAL; if (pci_read_config_word(dev, cap + PCI_X_CMD, &cmd)) return -EINVAL; return 512 << FIELD_GET(PCI_X_CMD_MAX_READ, cmd); } EXPORT_SYMBOL(pcix_get_mmrbc); /** * pcix_set_mmrbc - set PCI-X maximum memory read byte count * @dev: PCI device to query * @mmrbc: maximum memory read count in bytes * valid values are 512, 1024, 2048, 4096 * * If possible sets maximum memory read byte count, some bridges have errata * that prevent this. */ int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc) { int cap; u32 stat, v, o; u16 cmd; if (mmrbc < 512 || mmrbc > 4096 || !is_power_of_2(mmrbc)) return -EINVAL; v = ffs(mmrbc) - 10; cap = pci_find_capability(dev, PCI_CAP_ID_PCIX); if (!cap) return -EINVAL; if (pci_read_config_dword(dev, cap + PCI_X_STATUS, &stat)) return -EINVAL; if (v > FIELD_GET(PCI_X_STATUS_MAX_READ, stat)) return -E2BIG; if (pci_read_config_word(dev, cap + PCI_X_CMD, &cmd)) return -EINVAL; o = FIELD_GET(PCI_X_CMD_MAX_READ, cmd); if (o != v) { if (v > o && (dev->bus->bus_flags & PCI_BUS_FLAGS_NO_MMRBC)) return -EIO; cmd &= ~PCI_X_CMD_MAX_READ; cmd |= FIELD_PREP(PCI_X_CMD_MAX_READ, v); if (pci_write_config_word(dev, cap + PCI_X_CMD, cmd)) return -EIO; } return 0; } EXPORT_SYMBOL(pcix_set_mmrbc); /** * pcie_get_readrq - get PCI Express read request size * @dev: PCI device to query * * Returns maximum memory read request in bytes or appropriate error value. 
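 *
 * A small sketch of clamping a device's Max_Read_Request_Size (the
 * 512-byte limit is an illustrative value, not a recommendation):
 *
 *	if (pcie_get_readrq(pdev) > 512)
 *		pcie_set_readrq(pdev, 512);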
*/ int pcie_get_readrq(struct pci_dev *dev) { u16 ctl; pcie_capability_read_word(dev, PCI_EXP_DEVCTL, &ctl); return 128 << FIELD_GET(PCI_EXP_DEVCTL_READRQ, ctl); } EXPORT_SYMBOL(pcie_get_readrq); /** * pcie_set_readrq - set PCI Express maximum memory read request * @dev: PCI device to query * @rq: maximum memory read count in bytes * valid values are 128, 256, 512, 1024, 2048, 4096 * * If possible sets maximum memory read request in bytes */ int pcie_set_readrq(struct pci_dev *dev, int rq) { u16 v; int ret; unsigned int firstbit; struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus); if (rq < 128 || rq > 4096 || !is_power_of_2(rq)) return -EINVAL; /* * If using the "performance" PCIe config, we clamp the read rq * size to the max packet size to keep the host bridge from * generating requests larger than we can cope with. */ if (pcie_bus_config == PCIE_BUS_PERFORMANCE) { int mps = pcie_get_mps(dev); if (mps < rq) rq = mps; } firstbit = ffs(rq); if (firstbit < 8) return -EINVAL; v = FIELD_PREP(PCI_EXP_DEVCTL_READRQ, firstbit - 8); if (bridge->no_inc_mrrs) { int max_mrrs = pcie_get_readrq(dev); if (rq > max_mrrs) { pci_info(dev, "can't set Max_Read_Request_Size to %d; max is %d\n", rq, max_mrrs); return -EINVAL; } } ret = pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_READRQ, v); return pcibios_err_to_errno(ret); } EXPORT_SYMBOL(pcie_set_readrq); /** * pcie_get_mps - get PCI Express maximum payload size * @dev: PCI device to query * * Returns maximum payload size in bytes */ int pcie_get_mps(struct pci_dev *dev) { u16 ctl; pcie_capability_read_word(dev, PCI_EXP_DEVCTL, &ctl); return 128 << FIELD_GET(PCI_EXP_DEVCTL_PAYLOAD, ctl); } EXPORT_SYMBOL(pcie_get_mps); /** * pcie_set_mps - set PCI Express maximum payload size * @dev: PCI device to query * @mps: maximum payload size in bytes * valid values are 128, 256, 512, 1024, 2048, 4096 * * If possible sets maximum payload size */ int pcie_set_mps(struct pci_dev *dev, int mps) { u16 v; int ret; if (mps < 128 || mps > 4096 || !is_power_of_2(mps)) return -EINVAL; v = ffs(mps) - 8; if (v > dev->pcie_mpss) return -EINVAL; v = FIELD_PREP(PCI_EXP_DEVCTL_PAYLOAD, v); ret = pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_PAYLOAD, v); return pcibios_err_to_errno(ret); } EXPORT_SYMBOL(pcie_set_mps); static enum pci_bus_speed to_pcie_link_speed(u16 lnksta) { return pcie_link_speed[FIELD_GET(PCI_EXP_LNKSTA_CLS, lnksta)]; } int pcie_link_speed_mbps(struct pci_dev *pdev) { u16 lnksta; int err; err = pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnksta); if (err) return err; return pcie_dev_speed_mbps(to_pcie_link_speed(lnksta)); } EXPORT_SYMBOL(pcie_link_speed_mbps); /** * pcie_bandwidth_available - determine minimum link settings of a PCIe * device and its bandwidth limitation * @dev: PCI device to query * @limiting_dev: storage for device causing the bandwidth limitation * @speed: storage for speed of limiting device * @width: storage for width of limiting device * * Walk up the PCI device chain and find the point where the minimum * bandwidth is available. Return the bandwidth available there and (if * limiting_dev, speed, and width pointers are supplied) information about * that point. The bandwidth returned is in Mb/s, i.e., megabits/second of * raw bandwidth. 
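 *
 * For example, a link trained to 8.0 GT/s x4 reports roughly 4 * 7877 Mb/s
 * (8.0 GT/s minus the 128b/130b encoding overhead), i.e. about 31.5 Gb/s.
 * A hedged sketch of a caller that only wants the number:
 *
 *	bw = pcie_bandwidth_available(pdev, NULL, NULL, NULL);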
*/ u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev, enum pci_bus_speed *speed, enum pcie_link_width *width) { u16 lnksta; enum pci_bus_speed next_speed; enum pcie_link_width next_width; u32 bw, next_bw; if (speed) *speed = PCI_SPEED_UNKNOWN; if (width) *width = PCIE_LNK_WIDTH_UNKNOWN; bw = 0; while (dev) { pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta); next_speed = to_pcie_link_speed(lnksta); next_width = FIELD_GET(PCI_EXP_LNKSTA_NLW, lnksta); next_bw = next_width * PCIE_SPEED2MBS_ENC(next_speed); /* Check if current device limits the total bandwidth */ if (!bw || next_bw <= bw) { bw = next_bw; if (limiting_dev) *limiting_dev = dev; if (speed) *speed = next_speed; if (width) *width = next_width; } dev = pci_upstream_bridge(dev); } return bw; } EXPORT_SYMBOL(pcie_bandwidth_available); /** * pcie_get_supported_speeds - query Supported Link Speed Vector * @dev: PCI device to query * * Query @dev supported link speeds. * * Implementation Note in PCIe r6.0 sec 7.5.3.18 recommends determining * supported link speeds using the Supported Link Speeds Vector in the Link * Capabilities 2 Register (when available). * * Link Capabilities 2 was added in PCIe r3.0, sec 7.8.18. * * Without Link Capabilities 2, i.e., prior to PCIe r3.0, Supported Link * Speeds field in Link Capabilities is used and only 2.5 GT/s and 5.0 GT/s * speeds were defined. * * For @dev without Supported Link Speed Vector, the field is synthesized * from the Max Link Speed field in the Link Capabilities Register. * * Return: Supported Link Speeds Vector (+ reserved 0 at LSB). */ u8 pcie_get_supported_speeds(struct pci_dev *dev) { u32 lnkcap2, lnkcap; u8 speeds; /* * Speeds retain the reserved 0 at LSB before PCIe Supported Link * Speeds Vector to allow using SLS Vector bit defines directly. */ pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2); speeds = lnkcap2 & PCI_EXP_LNKCAP2_SLS; /* Ignore speeds higher than Max Link Speed */ pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap); speeds &= GENMASK(lnkcap & PCI_EXP_LNKCAP_SLS, 0); /* PCIe r3.0-compliant */ if (speeds) return speeds; /* Synthesize from the Max Link Speed field */ if ((lnkcap & PCI_EXP_LNKCAP_SLS) == PCI_EXP_LNKCAP_SLS_5_0GB) speeds = PCI_EXP_LNKCAP2_SLS_5_0GB | PCI_EXP_LNKCAP2_SLS_2_5GB; else if ((lnkcap & PCI_EXP_LNKCAP_SLS) == PCI_EXP_LNKCAP_SLS_2_5GB) speeds = PCI_EXP_LNKCAP2_SLS_2_5GB; return speeds; } /** * pcie_get_speed_cap - query for the PCI device's link speed capability * @dev: PCI device to query * * Query the PCI device speed capability. * * Return: the maximum link speed supported by the device. */ enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev) { return PCIE_LNKCAP2_SLS2SPEED(dev->supported_speeds); } EXPORT_SYMBOL(pcie_get_speed_cap); /** * pcie_get_width_cap - query for the PCI device's link width capability * @dev: PCI device to query * * Query the PCI device width capability. Return the maximum link width * supported by the device. 
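 *
 * A hedged sketch of combining the two capability queries, mirroring what
 * pcie_bandwidth_capable() below does internally:
 *
 *	speed = pcie_get_speed_cap(pdev);
 *	width = pcie_get_width_cap(pdev);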
*/ enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev) { u32 lnkcap; pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap); if (lnkcap) return FIELD_GET(PCI_EXP_LNKCAP_MLW, lnkcap); return PCIE_LNK_WIDTH_UNKNOWN; } EXPORT_SYMBOL(pcie_get_width_cap); /** * pcie_bandwidth_capable - calculate a PCI device's link bandwidth capability * @dev: PCI device * @speed: storage for link speed * @width: storage for link width * * Calculate a PCI device's link bandwidth by querying for its link speed * and width, multiplying them, and applying encoding overhead. The result * is in Mb/s, i.e., megabits/second of raw bandwidth. */ static u32 pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed, enum pcie_link_width *width) { *speed = pcie_get_speed_cap(dev); *width = pcie_get_width_cap(dev); if (*speed == PCI_SPEED_UNKNOWN || *width == PCIE_LNK_WIDTH_UNKNOWN) return 0; return *width * PCIE_SPEED2MBS_ENC(*speed); } /** * __pcie_print_link_status - Report the PCI device's link speed and width * @dev: PCI device to query * @verbose: Print info even when enough bandwidth is available * * If the available bandwidth at the device is less than the device is * capable of, report the device's maximum possible bandwidth and the * upstream link that limits its performance. If @verbose, always print * the available bandwidth, even if the device isn't constrained. */ void __pcie_print_link_status(struct pci_dev *dev, bool verbose) { enum pcie_link_width width, width_cap; enum pci_bus_speed speed, speed_cap; struct pci_dev *limiting_dev = NULL; u32 bw_avail, bw_cap; char *flit_mode = ""; bw_cap = pcie_bandwidth_capable(dev, &speed_cap, &width_cap); bw_avail = pcie_bandwidth_available(dev, &limiting_dev, &speed, &width); if (dev->bus && dev->bus->flit_mode) flit_mode = ", in Flit mode"; if (bw_avail >= bw_cap && verbose) pci_info(dev, "%u.%03u Gb/s available PCIe bandwidth (%s x%d link)%s\n", bw_cap / 1000, bw_cap % 1000, pci_speed_string(speed_cap), width_cap, flit_mode); else if (bw_avail < bw_cap) pci_info(dev, "%u.%03u Gb/s available PCIe bandwidth, limited by %s x%d link at %s (capable of %u.%03u Gb/s with %s x%d link)%s\n", bw_avail / 1000, bw_avail % 1000, pci_speed_string(speed), width, limiting_dev ? pci_name(limiting_dev) : "<unknown>", bw_cap / 1000, bw_cap % 1000, pci_speed_string(speed_cap), width_cap, flit_mode); } /** * pcie_print_link_status - Report the PCI device's link speed and width * @dev: PCI device to query * * Report the available bandwidth at the device. */ void pcie_print_link_status(struct pci_dev *dev) { __pcie_print_link_status(dev, true); } EXPORT_SYMBOL(pcie_print_link_status); /** * pci_select_bars - Make BAR mask from the type of resource * @dev: the PCI device for which BAR mask is made * @flags: resource type mask to be selected * * This helper routine makes bar mask from the type of resource. 
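 *
 * A typical pairing with the selected-regions helpers (the "mydrv" owner
 * string is illustrative):
 *
 *	bars = pci_select_bars(pdev, IORESOURCE_MEM);
 *	ret = pci_request_selected_regions(pdev, bars, "mydrv");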
*/ int pci_select_bars(struct pci_dev *dev, unsigned long flags) { int i, bars = 0; for (i = 0; i < PCI_NUM_RESOURCES; i++) if (pci_resource_flags(dev, i) & flags) bars |= (1 << i); return bars; } EXPORT_SYMBOL(pci_select_bars); /* Some architectures require additional programming to enable VGA */ static arch_set_vga_state_t arch_set_vga_state; void __init pci_register_set_vga_state(arch_set_vga_state_t func) { arch_set_vga_state = func; /* NULL disables */ } static int pci_set_vga_state_arch(struct pci_dev *dev, bool decode, unsigned int command_bits, u32 flags) { if (arch_set_vga_state) return arch_set_vga_state(dev, decode, command_bits, flags); return 0; } /** * pci_set_vga_state - set VGA decode state on device and parents if requested * @dev: the PCI device * @decode: true = enable decoding, false = disable decoding * @command_bits: PCI_COMMAND_IO and/or PCI_COMMAND_MEMORY * @flags: traverse ancestors and change bridges * CHANGE_BRIDGE_ONLY / CHANGE_BRIDGE */ int pci_set_vga_state(struct pci_dev *dev, bool decode, unsigned int command_bits, u32 flags) { struct pci_bus *bus; struct pci_dev *bridge; u16 cmd; int rc; WARN_ON((flags & PCI_VGA_STATE_CHANGE_DECODES) && (command_bits & ~(PCI_COMMAND_IO|PCI_COMMAND_MEMORY))); /* ARCH specific VGA enables */ rc = pci_set_vga_state_arch(dev, decode, command_bits, flags); if (rc) return rc; if (flags & PCI_VGA_STATE_CHANGE_DECODES) { pci_read_config_word(dev, PCI_COMMAND, &cmd); if (decode) cmd |= command_bits; else cmd &= ~command_bits; pci_write_config_word(dev, PCI_COMMAND, cmd); } if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE)) return 0; bus = dev->bus; while (bus) { bridge = bus->self; if (bridge) { pci_read_config_word(bridge, PCI_BRIDGE_CONTROL, &cmd); if (decode) cmd |= PCI_BRIDGE_CTL_VGA; else cmd &= ~PCI_BRIDGE_CTL_VGA; pci_write_config_word(bridge, PCI_BRIDGE_CONTROL, cmd); } bus = bus->parent; } return 0; } #ifdef CONFIG_ACPI bool pci_pr3_present(struct pci_dev *pdev) { struct acpi_device *adev; if (acpi_disabled) return false; adev = ACPI_COMPANION(&pdev->dev); if (!adev) return false; return adev->power.flags.power_resources && acpi_has_method(adev->handle, "_PR3"); } EXPORT_SYMBOL_GPL(pci_pr3_present); #endif /** * pci_add_dma_alias - Add a DMA devfn alias for a device * @dev: the PCI device for which alias is added * @devfn_from: alias slot and function * @nr_devfns: number of subsequent devfns to alias * * This helper encodes an 8-bit devfn as a bit number in dma_alias_mask * which is used to program permissible bus-devfn source addresses for DMA * requests in an IOMMU. These aliases factor into IOMMU group creation * and are useful for devices generating DMA requests beyond or different * from their logical bus-devfn. Examples include device quirks where the * device simply uses the wrong devfn, as well as non-transparent bridges * where the alias may be a proxy for devices in another domain. * * IOMMU group creation is performed during device discovery or addition, * prior to any potential DMA mapping and therefore prior to driver probing * (especially for userspace assigned devices where IOMMU group definition * cannot be left as a userspace activity). DMA aliases should therefore * be configured via quirks, such as the PCI fixup header quirk. 
*/ void pci_add_dma_alias(struct pci_dev *dev, u8 devfn_from, unsigned int nr_devfns) { int devfn_to; nr_devfns = min(nr_devfns, (unsigned int)MAX_NR_DEVFNS - devfn_from); devfn_to = devfn_from + nr_devfns - 1; if (!dev->dma_alias_mask) dev->dma_alias_mask = bitmap_zalloc(MAX_NR_DEVFNS, GFP_KERNEL); if (!dev->dma_alias_mask) { pci_warn(dev, "Unable to allocate DMA alias mask\n"); return; } bitmap_set(dev->dma_alias_mask, devfn_from, nr_devfns); if (nr_devfns == 1) pci_info(dev, "Enabling fixed DMA alias to %02x.%d\n", PCI_SLOT(devfn_from), PCI_FUNC(devfn_from)); else if (nr_devfns > 1) pci_info(dev, "Enabling fixed DMA alias for devfn range from %02x.%d to %02x.%d\n", PCI_SLOT(devfn_from), PCI_FUNC(devfn_from), PCI_SLOT(devfn_to), PCI_FUNC(devfn_to)); } bool pci_devs_are_dma_aliases(struct pci_dev *dev1, struct pci_dev *dev2) { return (dev1->dma_alias_mask && test_bit(dev2->devfn, dev1->dma_alias_mask)) || (dev2->dma_alias_mask && test_bit(dev1->devfn, dev2->dma_alias_mask)) || pci_real_dma_dev(dev1) == dev2 || pci_real_dma_dev(dev2) == dev1; } bool pci_device_is_present(struct pci_dev *pdev) { u32 v; /* Check PF if pdev is a VF, since VF Vendor/Device IDs are 0xffff */ pdev = pci_physfn(pdev); if (pci_dev_is_disconnected(pdev)) return false; return pci_bus_read_dev_vendor_id(pdev->bus, pdev->devfn, &v, 0); } EXPORT_SYMBOL_GPL(pci_device_is_present); void pci_ignore_hotplug(struct pci_dev *dev) { struct pci_dev *bridge = dev->bus->self; dev->ignore_hotplug = 1; /* Propagate the "ignore hotplug" setting to the parent bridge. */ if (bridge) bridge->ignore_hotplug = 1; } EXPORT_SYMBOL_GPL(pci_ignore_hotplug); /** * pci_real_dma_dev - Get PCI DMA device for PCI device * @dev: the PCI device that may have a PCI DMA alias * * Permits the platform to provide architecture-specific functionality to * devices needing to alias DMA to another PCI device on another PCI bus. If * the PCI device is on the same bus, it is recommended to use * pci_add_dma_alias(). This is the default implementation. Architecture * implementations can override this. */ struct pci_dev __weak *pci_real_dma_dev(struct pci_dev *dev) { return dev; } resource_size_t __weak pcibios_default_alignment(void) { return 0; } /* * Arches that don't want to expose struct resource to userland as-is in * sysfs and /proc can implement their own pci_resource_to_user(). */ void __weak pci_resource_to_user(const struct pci_dev *dev, int bar, const struct resource *rsrc, resource_size_t *start, resource_size_t *end) { *start = rsrc->start; *end = rsrc->end; } static char *resource_alignment_param; static DEFINE_SPINLOCK(resource_alignment_lock); /** * pci_specified_resource_alignment - get resource alignment specified by user. * @dev: the PCI device to get * @resize: whether or not to change resources' size when reassigning alignment * * RETURNS: Resource alignment if it is specified. * Zero if it is not specified. 
*/ static resource_size_t pci_specified_resource_alignment(struct pci_dev *dev, bool *resize) { int align_order, count; resource_size_t align = pcibios_default_alignment(); const char *p; int ret; spin_lock(&resource_alignment_lock); p = resource_alignment_param; if (!p || !*p) goto out; if (pci_has_flag(PCI_PROBE_ONLY)) { align = 0; pr_info_once("PCI: Ignoring requested alignments (PCI_PROBE_ONLY)\n"); goto out; } while (*p) { count = 0; if (sscanf(p, "%d%n", &align_order, &count) == 1 && p[count] == '@') { p += count + 1; if (align_order > 63) { pr_err("PCI: Invalid requested alignment (order %d)\n", align_order); align_order = PAGE_SHIFT; } } else { align_order = PAGE_SHIFT; } ret = pci_dev_str_match(dev, p, &p); if (ret == 1) { *resize = true; align = 1ULL << align_order; break; } else if (ret < 0) { pr_err("PCI: Can't parse resource_alignment parameter: %s\n", p); break; } if (*p != ';' && *p != ',') { /* End of param or invalid format */ break; } p++; } out: spin_unlock(&resource_alignment_lock); return align; } static void pci_request_resource_alignment(struct pci_dev *dev, int bar, resource_size_t align, bool resize) { struct resource *r = &dev->resource[bar]; const char *r_name = pci_resource_name(dev, bar); resource_size_t size; if (!(r->flags & IORESOURCE_MEM)) return; if (r->flags & IORESOURCE_PCI_FIXED) { pci_info(dev, "%s %pR: ignoring requested alignment %#llx\n", r_name, r, (unsigned long long)align); return; } size = resource_size(r); if (size >= align) return; /* * Increase the alignment of the resource. There are two ways we * can do this: * * 1) Increase the size of the resource. BARs are aligned on their * size, so when we reallocate space for this resource, we'll * allocate it with the larger alignment. This also prevents * assignment of any other BARs inside the alignment region, so * if we're requesting page alignment, this means no other BARs * will share the page. * * The disadvantage is that this makes the resource larger than * the hardware BAR, which may break drivers that compute things * based on the resource size, e.g., to find registers at a * fixed offset before the end of the BAR. * * 2) Retain the resource size, but use IORESOURCE_STARTALIGN and * set r->start to the desired alignment. By itself this * doesn't prevent other BARs being put inside the alignment * region, but if we realign *every* resource of every device in * the system, none of them will share an alignment region. * * When the user has requested alignment for only some devices via * the "pci=resource_alignment" argument, "resize" is true and we * use the first method. Otherwise we assume we're aligning all * devices and we use the second. */ pci_info(dev, "%s %pR: requesting alignment to %#llx\n", r_name, r, (unsigned long long)align); if (resize) { r->start = 0; r->end = align - 1; } else { r->flags &= ~IORESOURCE_SIZEALIGN; r->flags |= IORESOURCE_STARTALIGN; resource_set_range(r, align, size); } r->flags |= IORESOURCE_UNSET; } /* * This function disables memory decoding and releases memory resources * of the device specified by kernel's boot parameter 'pci=resource_alignment='. * It also rounds up size to specified alignment. * Later on, the kernel will assign page-aligned memory resource back * to the device. */ void pci_reassigndev_resource_alignment(struct pci_dev *dev) { int i; struct resource *r; resource_size_t align; u16 command; bool resize = false; /* * VF BARs are read-only zero according to SR-IOV spec r1.1, sec * 3.4.1.11. 
Their resources are allocated from the space * described by the VF BARx register in the PF's SR-IOV capability. * We can't influence their alignment here. */ if (dev->is_virtfn) return; /* check if specified PCI is target device to reassign */ align = pci_specified_resource_alignment(dev, &resize); if (!align) return; if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL && (dev->class >> 8) == PCI_CLASS_BRIDGE_HOST) { pci_warn(dev, "Can't reassign resources to host bridge\n"); return; } pci_read_config_word(dev, PCI_COMMAND, &command); command &= ~PCI_COMMAND_MEMORY; pci_write_config_word(dev, PCI_COMMAND, command); for (i = 0; i <= PCI_ROM_RESOURCE; i++) pci_request_resource_alignment(dev, i, align, resize); /* * Need to disable bridge's resource window, * to enable the kernel to reassign new resource * window later on. */ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { for (i = PCI_BRIDGE_RESOURCES; i < PCI_NUM_RESOURCES; i++) { r = &dev->resource[i]; if (!(r->flags & IORESOURCE_MEM)) continue; r->flags |= IORESOURCE_UNSET; r->end = resource_size(r) - 1; r->start = 0; } pci_disable_bridge_window(dev); } } static ssize_t resource_alignment_show(const struct bus_type *bus, char *buf) { size_t count = 0; spin_lock(&resource_alignment_lock); if (resource_alignment_param) count = sysfs_emit(buf, "%s\n", resource_alignment_param); spin_unlock(&resource_alignment_lock); return count; } static ssize_t resource_alignment_store(const struct bus_type *bus, const char *buf, size_t count) { char *param, *old, *end; if (count >= (PAGE_SIZE - 1)) return -EINVAL; param = kstrndup(buf, count, GFP_KERNEL); if (!param) return -ENOMEM; end = strchr(param, '\n'); if (end) *end = '\0'; spin_lock(&resource_alignment_lock); old = resource_alignment_param; if (strlen(param)) { resource_alignment_param = param; } else { kfree(param); resource_alignment_param = NULL; } spin_unlock(&resource_alignment_lock); kfree(old); return count; } static BUS_ATTR_RW(resource_alignment); static int __init pci_resource_alignment_sysfs_init(void) { return bus_create_file(&pci_bus_type, &bus_attr_resource_alignment); } late_initcall(pci_resource_alignment_sysfs_init); static void pci_no_domains(void) { #ifdef CONFIG_PCI_DOMAINS pci_domains_supported = 0; #endif } #ifdef CONFIG_PCI_DOMAINS_GENERIC static DEFINE_IDA(pci_domain_nr_static_ida); static DEFINE_IDA(pci_domain_nr_dynamic_ida); static void of_pci_reserve_static_domain_nr(void) { struct device_node *np; int domain_nr; for_each_node_by_type(np, "pci") { domain_nr = of_get_pci_domain_nr(np); if (domain_nr < 0) continue; /* * Permanently allocate domain_nr in dynamic_ida * to prevent it from dynamic allocation. */ ida_alloc_range(&pci_domain_nr_dynamic_ida, domain_nr, domain_nr, GFP_KERNEL); } } static int of_pci_bus_find_domain_nr(struct device *parent) { static bool static_domains_reserved = false; int domain_nr; /* On the first call scan device tree for static allocations. */ if (!static_domains_reserved) { of_pci_reserve_static_domain_nr(); static_domains_reserved = true; } if (parent) { /* * If domain is in DT, allocate it in static IDA. This * prevents duplicate static allocations in case of errors * in DT. */ domain_nr = of_get_pci_domain_nr(parent->of_node); if (domain_nr >= 0) return ida_alloc_range(&pci_domain_nr_static_ida, domain_nr, domain_nr, GFP_KERNEL); } /* * If domain was not specified in DT, choose a free ID from dynamic * allocations. 
All domain numbers from DT are permanently in * dynamic allocations to prevent assigning them to other DT nodes * without static domain. */ return ida_alloc(&pci_domain_nr_dynamic_ida, GFP_KERNEL); } static void of_pci_bus_release_domain_nr(struct device *parent, int domain_nr) { if (domain_nr < 0) return; /* Release domain from IDA where it was allocated. */ if (of_get_pci_domain_nr(parent->of_node) == domain_nr) ida_free(&pci_domain_nr_static_ida, domain_nr); else ida_free(&pci_domain_nr_dynamic_ida, domain_nr); } int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent) { return acpi_disabled ? of_pci_bus_find_domain_nr(parent) : acpi_pci_bus_find_domain_nr(bus); } void pci_bus_release_domain_nr(struct device *parent, int domain_nr) { if (!acpi_disabled) return; of_pci_bus_release_domain_nr(parent, domain_nr); } #endif /** * pci_ext_cfg_avail - can we access extended PCI config space? * * Returns 1 if we can access PCI extended config space (offsets * greater than 0xff). This is the default implementation. Architecture * implementations can override this. */ int __weak pci_ext_cfg_avail(void) { return 1; } static int __init pci_setup(char *str) { while (str) { char *k = strchr(str, ','); if (k) *k++ = 0; if (*str && (str = pcibios_setup(str)) && *str) { if (!strcmp(str, "nomsi")) { pci_no_msi(); } else if (!strncmp(str, "noats", 5)) { pr_info("PCIe: ATS is disabled\n"); pcie_ats_disabled = true; } else if (!strcmp(str, "noaer")) { pci_no_aer(); } else if (!strcmp(str, "earlydump")) { pci_early_dump = true; } else if (!strncmp(str, "realloc=", 8)) { pci_realloc_get_opt(str + 8); } else if (!strncmp(str, "realloc", 7)) { pci_realloc_get_opt("on"); } else if (!strcmp(str, "nodomains")) { pci_no_domains(); } else if (!strncmp(str, "noari", 5)) { pcie_ari_disabled = true; } else if (!strncmp(str, "notph", 5)) { pci_no_tph(); } else if (!strncmp(str, "cbiosize=", 9)) { pci_cardbus_io_size = memparse(str + 9, &str); } else if (!strncmp(str, "cbmemsize=", 10)) { pci_cardbus_mem_size = memparse(str + 10, &str); } else if (!strncmp(str, "resource_alignment=", 19)) { resource_alignment_param = str + 19; } else if (!strncmp(str, "ecrc=", 5)) { pcie_ecrc_get_policy(str + 5); } else if (!strncmp(str, "hpiosize=", 9)) { pci_hotplug_io_size = memparse(str + 9, &str); } else if (!strncmp(str, "hpmmiosize=", 11)) { pci_hotplug_mmio_size = memparse(str + 11, &str); } else if (!strncmp(str, "hpmmioprefsize=", 15)) { pci_hotplug_mmio_pref_size = memparse(str + 15, &str); } else if (!strncmp(str, "hpmemsize=", 10)) { pci_hotplug_mmio_size = memparse(str + 10, &str); pci_hotplug_mmio_pref_size = pci_hotplug_mmio_size; } else if (!strncmp(str, "hpbussize=", 10)) { pci_hotplug_bus_size = simple_strtoul(str + 10, &str, 0); if (pci_hotplug_bus_size > 0xff) pci_hotplug_bus_size = DEFAULT_HOTPLUG_BUS_SIZE; } else if (!strncmp(str, "pcie_bus_tune_off", 17)) { pcie_bus_config = PCIE_BUS_TUNE_OFF; } else if (!strncmp(str, "pcie_bus_safe", 13)) { pcie_bus_config = PCIE_BUS_SAFE; } else if (!strncmp(str, "pcie_bus_perf", 13)) { pcie_bus_config = PCIE_BUS_PERFORMANCE; } else if (!strncmp(str, "pcie_bus_peer2peer", 18)) { pcie_bus_config = PCIE_BUS_PEER2PEER; } else if (!strncmp(str, "pcie_scan_all", 13)) { pci_add_flags(PCI_SCAN_ALL_PCIE_DEVS); } else if (!strncmp(str, "disable_acs_redir=", 18)) { disable_acs_redir_param = str + 18; } else if (!strncmp(str, "config_acs=", 11)) { config_acs_param = str + 11; } else { pr_err("PCI: Unknown option `%s'\n", str); } } str = k; } return 0; } early_param("pci", 
pci_setup); /* * 'resource_alignment_param' and 'disable_acs_redir_param' are initialized * in pci_setup(), above, to point to data in the __initdata section which * will be freed after the init sequence is complete. We can't allocate memory * in pci_setup() because some architectures do not have any memory allocation * service available during an early_param() call. So we allocate memory and * copy the variable here before the init section is freed. * */ static int __init pci_realloc_setup_params(void) { resource_alignment_param = kstrdup(resource_alignment_param, GFP_KERNEL); disable_acs_redir_param = kstrdup(disable_acs_redir_param, GFP_KERNEL); config_acs_param = kstrdup(config_acs_param, GFP_KERNEL); return 0; } pure_initcall(pci_realloc_setup_params); |
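The pci.c chunk above derives available PCIe bandwidth by walking every upstream bridge from the device toward the Root Port and remembering the narrowest hop. Below is a minimal user-space sketch of that walk, not kernel code: device names, the fake_link struct and the per-lane Mb/s figures are all invented for illustration (in the kernel the per-lane figure comes from PCIE_SPEED2MBS_ENC, which already folds in encoding overhead).

/*
 * Illustrative sketch only: mirrors the loop in pcie_bandwidth_available(),
 * which multiplies negotiated link width by the per-lane rate at every hop
 * and keeps the hop with the smallest product as the limiting link.
 */
#include <stdio.h>

struct fake_link {
	const char *name;
	unsigned int lanes;              /* negotiated width, e.g. x4        */
	unsigned int mbps_per_lane;      /* e.g. 7877 for 8 GT/s, 128b/130b  */
	const struct fake_link *upstream;
};

static unsigned int chain_bandwidth(const struct fake_link *dev,
				    const struct fake_link **limiting)
{
	unsigned int bw = 0;

	for (; dev; dev = dev->upstream) {
		unsigned int next_bw = dev->lanes * dev->mbps_per_lane;

		/* keep the smallest link seen so far, like the kernel loop */
		if (!bw || next_bw <= bw) {
			bw = next_bw;
			*limiting = dev;
		}
	}
	return bw;
}

int main(void)
{
	struct fake_link root   = { "root port", 16, 15753, NULL };
	struct fake_link bridge = { "switch",     4,  7877, &root };
	struct fake_link nic    = { "nic",        8,  7877, &bridge };
	const struct fake_link *limiting = NULL;
	unsigned int bw = chain_bandwidth(&nic, &limiting);

	printf("available: %u.%03u Gb/s, limited by %s\n",
	       bw / 1000, bw % 1000, limiting->name);
	return 0;
}

Run against this made-up chain, the x4 switch link is reported as the bottleneck, which is the same shape of message __pcie_print_link_status() emits when bw_avail is below bw_cap.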
| 1 1 4 4 4 4 1 2 3 2 2 5 4 5 5 5 5 5 5 5 2 2 2 2 2 2 2 1 2 2 2 1 1 1 4 3 1 1 4 3 3 3 2 2 2 2 2 2 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Cong Wang <cong.wang@bytedance.com> */ #include <linux/bpf.h> #include <linux/skmsg.h> #include <net/af_unix.h> #include "af_unix.h" #define unix_sk_has_data(__sk, __psock) \ ({ !skb_queue_empty(&__sk->sk_receive_queue) || \ !skb_queue_empty(&__psock->ingress_skb) || \ !list_empty(&__psock->ingress_msg); \ }) static int unix_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct unix_sock *u = unix_sk(sk); int ret = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) return 1; if (!timeo) return ret; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); if (!unix_sk_has_data(sk, psock)) { mutex_unlock(&u->iolock); wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); mutex_lock(&u->iolock); ret = unix_sk_has_data(sk, psock); } sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return ret; } static int __unix_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { if (sk->sk_type == SOCK_DGRAM) return __unix_dgram_recvmsg(sk, msg, len, flags); else return __unix_stream_recvmsg(sk, msg, len, flags); } static int unix_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct unix_sock *u = unix_sk(sk); struct sk_psock *psock; int copied; if (flags & MSG_OOB) return -EOPNOTSUPP; if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) return __unix_recvmsg(sk, msg, len, flags); mutex_lock(&u->iolock); if (!skb_queue_empty(&sk->sk_receive_queue) && sk_psock_queue_empty(psock)) { mutex_unlock(&u->iolock); sk_psock_put(sk, psock); return __unix_recvmsg(sk, msg, len, flags); } msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { long timeo; int data; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); data = unix_msg_wait_data(sk, psock, timeo); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; mutex_unlock(&u->iolock); sk_psock_put(sk, psock); return __unix_recvmsg(sk, msg, len, flags); } copied = -EAGAIN; } mutex_unlock(&u->iolock); sk_psock_put(sk, psock); return copied; } static struct proto *unix_dgram_prot_saved __read_mostly; static DEFINE_SPINLOCK(unix_dgram_prot_lock); static struct proto unix_dgram_bpf_prot; static struct proto *unix_stream_prot_saved __read_mostly; static DEFINE_SPINLOCK(unix_stream_prot_lock); static struct proto unix_stream_bpf_prot; static void unix_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; prot->close = sock_map_close; prot->recvmsg = unix_bpf_recvmsg; prot->sock_is_readable = sk_msg_is_readable; } static void unix_stream_bpf_rebuild_protos(struct proto *prot, const 
struct proto *base) { *prot = *base; prot->close = sock_map_close; prot->recvmsg = unix_bpf_recvmsg; prot->sock_is_readable = sk_msg_is_readable; prot->unhash = sock_map_unhash; } static void unix_dgram_bpf_check_needs_rebuild(struct proto *ops) { if (unlikely(ops != smp_load_acquire(&unix_dgram_prot_saved))) { spin_lock_bh(&unix_dgram_prot_lock); if (likely(ops != unix_dgram_prot_saved)) { unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, ops); smp_store_release(&unix_dgram_prot_saved, ops); } spin_unlock_bh(&unix_dgram_prot_lock); } } static void unix_stream_bpf_check_needs_rebuild(struct proto *ops) { if (unlikely(ops != smp_load_acquire(&unix_stream_prot_saved))) { spin_lock_bh(&unix_stream_prot_lock); if (likely(ops != unix_stream_prot_saved)) { unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, ops); smp_store_release(&unix_stream_prot_saved, ops); } spin_unlock_bh(&unix_stream_prot_lock); } } int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { if (sk->sk_type != SOCK_DGRAM) return -EOPNOTSUPP; if (restore) { sk->sk_write_space = psock->saved_write_space; sock_replace_proto(sk, psock->sk_proto); return 0; } unix_dgram_bpf_check_needs_rebuild(psock->sk_proto); sock_replace_proto(sk, &unix_dgram_bpf_prot); return 0; } int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { struct sock *sk_pair; /* Restore does not decrement the sk_pair reference yet because we must * keep the a reference to the socket until after an RCU grace period * and any pending sends have completed. */ if (restore) { sk->sk_write_space = psock->saved_write_space; sock_replace_proto(sk, psock->sk_proto); return 0; } /* psock_update_sk_prot can be called multiple times if psock is * added to multiple maps and/or slots in the same map. There is * also an edge case where replacing a psock with itself can trigger * an extra psock_update_sk_prot during the insert process. So it * must be safe to do multiple calls. Here we need to ensure we don't * increment the refcnt through sock_hold many times. There will only * be a single matching destroy operation. */ if (!psock->sk_pair) { sk_pair = unix_peer(sk); sock_hold(sk_pair); psock->sk_pair = sk_pair; } unix_stream_bpf_check_needs_rebuild(psock->sk_proto); sock_replace_proto(sk, &unix_stream_bpf_prot); return 0; } void __init unix_bpf_build_proto(void) { unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, &unix_dgram_proto); unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, &unix_stream_proto); } |
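The unix_bpf.c chunk above installs a BPF-aware struct proto by copying the socket's original proto, overriding a few callbacks, and rebuilding that copy at most once per distinct base proto: an acquire load on the fast path, a lock plus release store on the slow path (see unix_dgram_bpf_check_needs_rebuild). A hedged user-space sketch of that double-checked rebuild pattern follows; every name in it is invented and it is not a kernel API.

/*
 * Illustrative sketch only: a derived ops table is rebuilt at most once per
 * base table, mirroring unix_dgram_bpf_check_needs_rebuild().
 */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

struct fake_ops {
	void (*recvmsg)(void);
};

static void base_recvmsg(void)     { puts("base recvmsg"); }
static void override_recvmsg(void) { puts("bpf recvmsg"); }

static struct fake_ops derived_ops;
static _Atomic(const struct fake_ops *) saved_base;
static pthread_mutex_t rebuild_lock = PTHREAD_MUTEX_INITIALIZER;

static void check_needs_rebuild(const struct fake_ops *base)
{
	/* fast path: derived_ops was already built from this base */
	if (atomic_load_explicit(&saved_base, memory_order_acquire) == base)
		return;

	pthread_mutex_lock(&rebuild_lock);
	if (saved_base != base) {
		derived_ops = *base;                    /* start from the original  */
		derived_ops.recvmsg = override_recvmsg; /* override a single hook   */
		atomic_store_explicit(&saved_base, base, memory_order_release);
	}
	pthread_mutex_unlock(&rebuild_lock);
}

int main(void)
{
	const struct fake_ops base = { base_recvmsg };

	check_needs_rebuild(&base);	/* builds derived_ops once */
	check_needs_rebuild(&base);	/* second call is a no-op  */
	derived_ops.recvmsg();
	return 0;
}

The point of the recheck under the lock is the same as in the kernel code: several callers may race past the fast path, but only the first one rebuilds and publishes the table.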
// SPDX-License-Identifier: MIT /* * Copyright 2018 Noralf Trønnes * Copyright (c) 2006-2009 Red Hat Inc. * Copyright (c) 2006-2008 Intel Corporation * Jesse Barnes <jesse.barnes@intel.com> * Copyright (c) 2007 Dave Airlie <airlied@linux.ie> */ #include "drm/drm_modeset_lock.h" #include <linux/export.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/string_helpers.h> #include <drm/drm_atomic.h> #include <drm/drm_client.h> #include <drm/drm_connector.h> #include <drm/drm_crtc.h> #include <drm/drm_device.h> #include <drm/drm_drv.h> #include <drm/drm_edid.h> #include <drm/drm_encoder.h> #include <drm/drm_print.h> #include "drm_crtc_internal.h" #include "drm_internal.h" #define DRM_CLIENT_MAX_CLONED_CONNECTORS 8 struct drm_client_offset { int x, y; }; int drm_client_modeset_create(struct drm_client_dev *client) { struct drm_device *dev = client->dev; unsigned int num_crtc = dev->mode_config.num_crtc; unsigned int max_connector_count = 1; struct drm_mode_set *modeset; struct drm_crtc *crtc; int i = 0; /* Add terminating zero entry to enable index less iteration */ client->modesets = kcalloc(num_crtc + 1, sizeof(*client->modesets), GFP_KERNEL); if (!client->modesets) return -ENOMEM; mutex_init(&client->modeset_mutex); drm_for_each_crtc(crtc, dev) client->modesets[i++].crtc = crtc; /* Cloning is only supported in the single crtc case. 
*/ if (num_crtc == 1) max_connector_count = DRM_CLIENT_MAX_CLONED_CONNECTORS; for (modeset = client->modesets; modeset->crtc; modeset++) { modeset->connectors = kcalloc(max_connector_count, sizeof(*modeset->connectors), GFP_KERNEL); if (!modeset->connectors) goto err_free; } return 0; err_free: drm_client_modeset_free(client); return -ENOMEM; } static void drm_client_modeset_release(struct drm_client_dev *client) { struct drm_mode_set *modeset; drm_client_for_each_modeset(modeset, client) { int i; drm_mode_destroy(client->dev, modeset->mode); modeset->mode = NULL; modeset->fb = NULL; for (i = 0; i < modeset->num_connectors; i++) { drm_connector_put(modeset->connectors[i]); modeset->connectors[i] = NULL; } modeset->num_connectors = 0; } } void drm_client_modeset_free(struct drm_client_dev *client) { struct drm_mode_set *modeset; mutex_lock(&client->modeset_mutex); drm_client_modeset_release(client); drm_client_for_each_modeset(modeset, client) kfree(modeset->connectors); mutex_unlock(&client->modeset_mutex); mutex_destroy(&client->modeset_mutex); kfree(client->modesets); } static struct drm_mode_set * drm_client_find_modeset(struct drm_client_dev *client, struct drm_crtc *crtc) { struct drm_mode_set *modeset; drm_client_for_each_modeset(modeset, client) if (modeset->crtc == crtc) return modeset; return NULL; } static const struct drm_display_mode * drm_connector_get_tiled_mode(struct drm_connector *connector) { const struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay == connector->tile_h_size && mode->vdisplay == connector->tile_v_size) return mode; } return NULL; } static const struct drm_display_mode * drm_connector_fallback_non_tiled_mode(struct drm_connector *connector) { const struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay == connector->tile_h_size && mode->vdisplay == connector->tile_v_size) continue; return mode; } return NULL; } static const struct drm_display_mode * drm_connector_preferred_mode(struct drm_connector *connector, int width, int height) { const struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay > width || mode->vdisplay > height) continue; if (mode->type & DRM_MODE_TYPE_PREFERRED) return mode; } return NULL; } static const struct drm_display_mode * drm_connector_first_mode(struct drm_connector *connector) { return list_first_entry_or_null(&connector->modes, struct drm_display_mode, head); } static const struct drm_display_mode * drm_connector_pick_cmdline_mode(struct drm_connector *connector) { const struct drm_cmdline_mode *cmdline_mode; const struct drm_display_mode *mode; bool prefer_non_interlace; /* * Find a user-defined mode. If the user gave us a valid * mode on the kernel command line, it will show up in this * list. */ list_for_each_entry(mode, &connector->modes, head) { if (mode->type & DRM_MODE_TYPE_USERDEF) return mode; } cmdline_mode = &connector->cmdline_mode; if (cmdline_mode->specified == false) return NULL; /* * Attempt to find a matching mode in the list of modes we * have gotten so far. 
*/ prefer_non_interlace = !cmdline_mode->interlace; again: list_for_each_entry(mode, &connector->modes, head) { /* check width/height */ if (mode->hdisplay != cmdline_mode->xres || mode->vdisplay != cmdline_mode->yres) continue; if (cmdline_mode->refresh_specified) { if (drm_mode_vrefresh(mode) != cmdline_mode->refresh) continue; } if (cmdline_mode->interlace) { if (!(mode->flags & DRM_MODE_FLAG_INTERLACE)) continue; } else if (prefer_non_interlace) { if (mode->flags & DRM_MODE_FLAG_INTERLACE) continue; } return mode; } if (prefer_non_interlace) { prefer_non_interlace = false; goto again; } return NULL; } static bool drm_connector_enabled(struct drm_connector *connector, bool strict) { bool enable; if (connector->display_info.non_desktop) return false; if (strict) enable = connector->status == connector_status_connected; else enable = connector->status != connector_status_disconnected; return enable; } static void drm_client_connectors_enabled(struct drm_connector *connectors[], unsigned int connector_count, bool enabled[]) { bool any_enabled = false; struct drm_connector *connector; int i = 0; for (i = 0; i < connector_count; i++) { connector = connectors[i]; enabled[i] = drm_connector_enabled(connector, true); drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] enabled? %s\n", connector->base.id, connector->name, connector->display_info.non_desktop ? "non desktop" : str_yes_no(enabled[i])); any_enabled |= enabled[i]; } if (any_enabled) return; for (i = 0; i < connector_count; i++) enabled[i] = drm_connector_enabled(connectors[i], false); } static void mode_replace(struct drm_device *dev, const struct drm_display_mode **dst, const struct drm_display_mode *src) { drm_mode_destroy(dev, (struct drm_display_mode *)*dst); *dst = src ? drm_mode_duplicate(dev, src) : NULL; } static void modes_destroy(struct drm_device *dev, const struct drm_display_mode *modes[], int count) { int i; for (i = 0; i < count; i++) mode_replace(dev, &modes[i], NULL); } static bool drm_client_target_cloned(struct drm_device *dev, struct drm_connector *connectors[], unsigned int connector_count, const struct drm_display_mode *modes[], struct drm_client_offset offsets[], bool enabled[], int width, int height) { int count, i; bool can_clone = false; struct drm_display_mode *dmt_mode; /* only contemplate cloning in the single crtc case */ if (dev->mode_config.num_crtc > 1) return false; count = 0; for (i = 0; i < connector_count; i++) { if (enabled[i]) count++; } /* only contemplate cloning if more than one connector is enabled */ if (count <= 1) return false; /* check the command line or if nothing common pick 1024x768 */ can_clone = true; for (i = 0; i < connector_count; i++) { int j; if (!enabled[i]) continue; mode_replace(dev, &modes[i], drm_connector_pick_cmdline_mode(connectors[i])); if (!modes[i]) { can_clone = false; break; } for (j = 0; j < i; j++) { if (!enabled[j]) continue; if (!drm_mode_match(modes[j], modes[i], DRM_MODE_MATCH_TIMINGS | DRM_MODE_MATCH_CLOCK | DRM_MODE_MATCH_FLAGS | DRM_MODE_MATCH_3D_FLAGS)) can_clone = false; } } if (can_clone) { drm_dbg_kms(dev, "can clone using command line\n"); return true; } /* try and find a 1024x768 mode on each connector */ can_clone = true; dmt_mode = drm_mode_find_dmt(dev, 1024, 768, 60, false); if (!dmt_mode) goto fail; for (i = 0; i < connector_count; i++) { const struct drm_display_mode *mode; if (!enabled[i]) continue; list_for_each_entry(mode, &connectors[i]->modes, head) { if (drm_mode_match(mode, dmt_mode, DRM_MODE_MATCH_TIMINGS | DRM_MODE_MATCH_CLOCK | 
DRM_MODE_MATCH_FLAGS | DRM_MODE_MATCH_3D_FLAGS)) mode_replace(dev, &modes[i], mode); } if (!modes[i]) can_clone = false; } drm_mode_destroy(dev, dmt_mode); if (can_clone) { drm_dbg_kms(dev, "can clone using 1024x768\n"); return true; } fail: drm_info(dev, "kms: can't enable cloning when we probably wanted to.\n"); return false; } static int drm_client_get_tile_offsets(struct drm_device *dev, struct drm_connector *connectors[], unsigned int connector_count, const struct drm_display_mode *modes[], struct drm_client_offset offsets[], int idx, int h_idx, int v_idx) { int i; int hoffset = 0, voffset = 0; for (i = 0; i < connector_count; i++) { struct drm_connector *connector = connectors[i]; if (!connector->has_tile) continue; if (!modes[i] && (h_idx || v_idx)) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] no modes for connector tiled %d\n", connector->base.id, connector->name, i); continue; } if (connector->tile_h_loc < h_idx) hoffset += modes[i]->hdisplay; if (connector->tile_v_loc < v_idx) voffset += modes[i]->vdisplay; } offsets[idx].x = hoffset; offsets[idx].y = voffset; drm_dbg_kms(dev, "returned %d %d for %d %d\n", hoffset, voffset, h_idx, v_idx); return 0; } static bool drm_client_target_preferred(struct drm_device *dev, struct drm_connector *connectors[], unsigned int connector_count, const struct drm_display_mode *modes[], struct drm_client_offset offsets[], bool enabled[], int width, int height) { const u64 mask = BIT_ULL(connector_count) - 1; u64 conn_configured = 0; int tile_pass = 0; int num_tiled_conns = 0; int i; for (i = 0; i < connector_count; i++) { if (connectors[i]->has_tile && connectors[i]->status == connector_status_connected) num_tiled_conns++; } retry: for (i = 0; i < connector_count; i++) { struct drm_connector *connector = connectors[i]; const char *mode_type; if (conn_configured & BIT_ULL(i)) continue; if (enabled[i] == false) { conn_configured |= BIT_ULL(i); continue; } /* first pass over all the untiled connectors */ if (tile_pass == 0 && connector->has_tile) continue; if (tile_pass == 1) { if (connector->tile_h_loc != 0 || connector->tile_v_loc != 0) continue; } else { if (connector->tile_h_loc != tile_pass - 1 && connector->tile_v_loc != tile_pass - 1) /* if this tile_pass doesn't cover any of the tiles - keep going */ continue; /* * find the tile offsets for this pass - need to find * all tiles left and above */ drm_client_get_tile_offsets(dev, connectors, connector_count, modes, offsets, i, connector->tile_h_loc, connector->tile_v_loc); } mode_type = "cmdline"; mode_replace(dev, &modes[i], drm_connector_pick_cmdline_mode(connector)); if (!modes[i]) { mode_type = "preferred"; mode_replace(dev, &modes[i], drm_connector_preferred_mode(connector, width, height)); } if (!modes[i]) { mode_type = "first"; mode_replace(dev, &modes[i], drm_connector_first_mode(connector)); } /* * In case of tiled mode if all tiles not present fallback to * first available non tiled mode. * After all tiles are present, try to find the tiled mode * for all and if tiled mode not present due to fbcon size * limitations, use first non tiled mode only for * tile 0,0 and set to no mode for all other tiles. 
*/ if (connector->has_tile) { if (num_tiled_conns < connector->num_h_tile * connector->num_v_tile || (connector->tile_h_loc == 0 && connector->tile_v_loc == 0 && !drm_connector_get_tiled_mode(connector))) { mode_type = "non tiled"; mode_replace(dev, &modes[i], drm_connector_fallback_non_tiled_mode(connector)); } else { mode_type = "tiled"; mode_replace(dev, &modes[i], drm_connector_get_tiled_mode(connector)); } } if (modes[i]) drm_dbg_kms(dev, "[CONNECTOR:%d:%s] found %s mode: %s\n", connector->base.id, connector->name, mode_type, modes[i]->name); else drm_dbg_kms(dev, "[CONNECTOR:%d:%s] no mode found\n", connector->base.id, connector->name); conn_configured |= BIT_ULL(i); } if ((conn_configured & mask) != mask) { tile_pass++; goto retry; } return true; } static bool connector_has_possible_crtc(struct drm_connector *connector, struct drm_crtc *crtc) { struct drm_encoder *encoder; drm_connector_for_each_possible_encoder(connector, encoder) { if (encoder->possible_crtcs & drm_crtc_mask(crtc)) return true; } return false; } static int drm_client_pick_crtcs(struct drm_client_dev *client, struct drm_connector *connectors[], unsigned int connector_count, struct drm_crtc *best_crtcs[], const struct drm_display_mode *modes[], int n, int width, int height) { struct drm_device *dev = client->dev; struct drm_connector *connector; int my_score, best_score, score; struct drm_crtc **crtcs; struct drm_mode_set *modeset; if (n == connector_count) return 0; connector = connectors[n]; best_crtcs[n] = NULL; best_score = drm_client_pick_crtcs(client, connectors, connector_count, best_crtcs, modes, n + 1, width, height); if (modes[n] == NULL) return best_score; crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL); if (!crtcs) return best_score; my_score = 1; if (connector->status == connector_status_connected) my_score++; if (connector->cmdline_mode.specified) my_score++; if (drm_connector_preferred_mode(connector, width, height)) my_score++; /* * select a crtc for this connector and then attempt to configure * remaining connectors */ drm_client_for_each_modeset(modeset, client) { struct drm_crtc *crtc = modeset->crtc; int o; if (!connector_has_possible_crtc(connector, crtc)) continue; for (o = 0; o < n; o++) if (best_crtcs[o] == crtc) break; if (o < n) { /* ignore cloning unless only a single crtc */ if (dev->mode_config.num_crtc > 1) continue; if (!drm_mode_equal(modes[o], modes[n])) continue; } crtcs[n] = crtc; memcpy(crtcs, best_crtcs, n * sizeof(*crtcs)); score = my_score + drm_client_pick_crtcs(client, connectors, connector_count, crtcs, modes, n + 1, width, height); if (score > best_score) { best_score = score; memcpy(best_crtcs, crtcs, connector_count * sizeof(*crtcs)); } } kfree(crtcs); return best_score; } /* Try to read the BIOS display configuration and use it for the initial config */ static bool drm_client_firmware_config(struct drm_client_dev *client, struct drm_connector *connectors[], unsigned int connector_count, struct drm_crtc *crtcs[], const struct drm_display_mode *modes[], struct drm_client_offset offsets[], bool enabled[], int width, int height) { const int count = min_t(unsigned int, connector_count, BITS_PER_LONG); unsigned long conn_configured, conn_seq, mask; struct drm_device *dev = client->dev; int i; bool *save_enabled; bool fallback = true, ret = true; int num_connectors_enabled = 0; int num_connectors_detected = 0; int num_tiled_conns = 0; struct drm_modeset_acquire_ctx ctx; if (!drm_drv_uses_atomic_modeset(dev)) return false; if (drm_WARN_ON(dev, count <= 0)) return 
false; save_enabled = kcalloc(count, sizeof(bool), GFP_KERNEL); if (!save_enabled) return false; drm_modeset_acquire_init(&ctx, 0); while (drm_modeset_lock_all_ctx(dev, &ctx) != 0) drm_modeset_backoff(&ctx); memcpy(save_enabled, enabled, count); mask = GENMASK(count - 1, 0); conn_configured = 0; for (i = 0; i < count; i++) { if (connectors[i]->has_tile && connectors[i]->status == connector_status_connected) num_tiled_conns++; } retry: conn_seq = conn_configured; for (i = 0; i < count; i++) { struct drm_connector *connector = connectors[i]; struct drm_encoder *encoder; struct drm_crtc *crtc; const char *mode_type; int j; if (conn_configured & BIT(i)) continue; if (conn_seq == 0 && !connector->has_tile) continue; if (connector->status == connector_status_connected) num_connectors_detected++; if (!enabled[i]) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] not enabled, skipping\n", connector->base.id, connector->name); conn_configured |= BIT(i); continue; } if (connector->force == DRM_FORCE_OFF) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] disabled by user, skipping\n", connector->base.id, connector->name); enabled[i] = false; continue; } encoder = connector->state->best_encoder; if (!encoder || drm_WARN_ON(dev, !connector->state->crtc)) { if (connector->force > DRM_FORCE_OFF) goto bail; drm_dbg_kms(dev, "[CONNECTOR:%d:%s] has no encoder or crtc, skipping\n", connector->base.id, connector->name); enabled[i] = false; conn_configured |= BIT(i); continue; } num_connectors_enabled++; crtc = connector->state->crtc; /* * Make sure we're not trying to drive multiple connectors * with a single CRTC, since our cloning support may not * match the BIOS. */ for (j = 0; j < count; j++) { if (crtcs[j] == crtc) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] fallback: cloned configuration\n", connector->base.id, connector->name); goto bail; } } mode_type = "cmdline"; mode_replace(dev, &modes[i], drm_connector_pick_cmdline_mode(connector)); if (!modes[i]) { mode_type = "preferred"; mode_replace(dev, &modes[i], drm_connector_preferred_mode(connector, width, height)); } if (!modes[i]) { mode_type = "first"; mode_replace(dev, &modes[i], drm_connector_first_mode(connector)); } /* last resort: use current mode */ if (!modes[i]) { mode_type = "current"; mode_replace(dev, &modes[i], &crtc->state->mode); } /* * In case of tiled modes, if all tiles are not present * then fallback to a non tiled mode. */ if (connector->has_tile && num_tiled_conns < connector->num_h_tile * connector->num_v_tile) { mode_type = "non tiled"; mode_replace(dev, &modes[i], drm_connector_fallback_non_tiled_mode(connector)); } crtcs[i] = crtc; drm_dbg_kms(dev, "[CONNECTOR::%d:%s] on [CRTC:%d:%s] using %s mode: %s\n", connector->base.id, connector->name, crtc->base.id, crtc->name, mode_type, modes[i]->name); fallback = false; conn_configured |= BIT(i); } if ((conn_configured & mask) != mask && conn_configured != conn_seq) goto retry; for (i = 0; i < count; i++) { struct drm_connector *connector = connectors[i]; if (connector->has_tile) drm_client_get_tile_offsets(dev, connectors, connector_count, modes, offsets, i, connector->tile_h_loc, connector->tile_v_loc); } /* * If the BIOS didn't enable everything it could, fall back to have the * same user experiencing of lighting up as much as possible like the * fbdev helper library. 
*/ if (num_connectors_enabled != num_connectors_detected && num_connectors_enabled < dev->mode_config.num_crtc) { drm_dbg_kms(dev, "fallback: Not all outputs enabled\n"); drm_dbg_kms(dev, "Enabled: %i, detected: %i\n", num_connectors_enabled, num_connectors_detected); fallback = true; } if (fallback) { bail: drm_dbg_kms(dev, "Not using firmware configuration\n"); memcpy(enabled, save_enabled, count); ret = false; } drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); kfree(save_enabled); return ret; } /** * drm_client_modeset_probe() - Probe for displays * @client: DRM client * @width: Maximum display mode width (optional) * @height: Maximum display mode height (optional) * * This function sets up display pipelines for enabled connectors and stores the * config in the client's modeset array. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, unsigned int height) { struct drm_connector *connector, **connectors = NULL; struct drm_connector_list_iter conn_iter; struct drm_device *dev = client->dev; unsigned int total_modes_count = 0; struct drm_client_offset *offsets; unsigned int connector_count = 0; const struct drm_display_mode **modes; struct drm_crtc **crtcs; int i, ret = 0; bool *enabled; drm_dbg_kms(dev, "\n"); if (!width) width = dev->mode_config.max_width; if (!height) height = dev->mode_config.max_height; drm_connector_list_iter_begin(dev, &conn_iter); drm_client_for_each_connector_iter(connector, &conn_iter) { struct drm_connector **tmp; tmp = krealloc(connectors, (connector_count + 1) * sizeof(*connectors), GFP_KERNEL); if (!tmp) { ret = -ENOMEM; goto free_connectors; } connectors = tmp; drm_connector_get(connector); connectors[connector_count++] = connector; } drm_connector_list_iter_end(&conn_iter); if (!connector_count) return 0; crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL); modes = kcalloc(connector_count, sizeof(*modes), GFP_KERNEL); offsets = kcalloc(connector_count, sizeof(*offsets), GFP_KERNEL); enabled = kcalloc(connector_count, sizeof(bool), GFP_KERNEL); if (!crtcs || !modes || !enabled || !offsets) { ret = -ENOMEM; goto out; } mutex_lock(&client->modeset_mutex); mutex_lock(&dev->mode_config.mutex); for (i = 0; i < connector_count; i++) total_modes_count += connectors[i]->funcs->fill_modes(connectors[i], width, height); if (!total_modes_count) drm_dbg_kms(dev, "No connectors reported connected with modes\n"); drm_client_connectors_enabled(connectors, connector_count, enabled); if (!drm_client_firmware_config(client, connectors, connector_count, crtcs, modes, offsets, enabled, width, height)) { modes_destroy(dev, modes, connector_count); memset(crtcs, 0, connector_count * sizeof(*crtcs)); memset(offsets, 0, connector_count * sizeof(*offsets)); if (!drm_client_target_cloned(dev, connectors, connector_count, modes, offsets, enabled, width, height) && !drm_client_target_preferred(dev, connectors, connector_count, modes, offsets, enabled, width, height)) drm_err(dev, "Unable to find initial modes\n"); drm_dbg_kms(dev, "picking CRTCs for %dx%d config\n", width, height); drm_client_pick_crtcs(client, connectors, connector_count, crtcs, modes, 0, width, height); } mutex_unlock(&dev->mode_config.mutex); drm_client_modeset_release(client); for (i = 0; i < connector_count; i++) { const struct drm_display_mode *mode = modes[i]; struct drm_crtc *crtc = crtcs[i]; struct drm_client_offset *offset = &offsets[i]; if (mode && crtc) { struct drm_mode_set *modeset = 
drm_client_find_modeset(client, crtc); struct drm_connector *connector = connectors[i]; drm_dbg_kms(dev, "[CRTC:%d:%s] desired mode %s set (%d,%d)\n", crtc->base.id, crtc->name, mode->name, offset->x, offset->y); if (drm_WARN_ON_ONCE(dev, modeset->num_connectors == DRM_CLIENT_MAX_CLONED_CONNECTORS || (dev->mode_config.num_crtc > 1 && modeset->num_connectors == 1))) { ret = -EINVAL; break; } drm_mode_destroy(dev, modeset->mode); modeset->mode = drm_mode_duplicate(dev, mode); if (!modeset->mode) { ret = -ENOMEM; break; } drm_connector_get(connector); modeset->connectors[modeset->num_connectors++] = connector; modeset->x = offset->x; modeset->y = offset->y; } } mutex_unlock(&client->modeset_mutex); out: kfree(crtcs); modes_destroy(dev, modes, connector_count); kfree(modes); kfree(offsets); kfree(enabled); free_connectors: for (i = 0; i < connector_count; i++) drm_connector_put(connectors[i]); kfree(connectors); return ret; } EXPORT_SYMBOL(drm_client_modeset_probe); /** * drm_client_rotation() - Check the initial rotation value * @modeset: DRM modeset * @rotation: Returned rotation value * * This function checks if the primary plane in @modeset can hw rotate * to match the rotation needed on its connector. * * Note: Currently only 0 and 180 degrees are supported. * * Return: * True if the plane can do the rotation, false otherwise. */ bool drm_client_rotation(struct drm_mode_set *modeset, unsigned int *rotation) { struct drm_connector *connector = modeset->connectors[0]; struct drm_plane *plane = modeset->crtc->primary; struct drm_cmdline_mode *cmdline; u64 valid_mask = 0; int i; if (!modeset->num_connectors) return false; switch (connector->display_info.panel_orientation) { case DRM_MODE_PANEL_ORIENTATION_BOTTOM_UP: *rotation = DRM_MODE_ROTATE_180; break; case DRM_MODE_PANEL_ORIENTATION_LEFT_UP: *rotation = DRM_MODE_ROTATE_90; break; case DRM_MODE_PANEL_ORIENTATION_RIGHT_UP: *rotation = DRM_MODE_ROTATE_270; break; default: *rotation = DRM_MODE_ROTATE_0; } /** * The panel already defined the default rotation * through its orientation. Whatever has been provided * on the command line needs to be added to that. * * Unfortunately, the rotations are at different bit * indices, so the math to add them up are not as * trivial as they could. * * Reflections on the other hand are pretty trivial to deal with, a * simple XOR between the two handle the addition nicely. */ cmdline = &connector->cmdline_mode; if (cmdline->specified && cmdline->rotation_reflection) { unsigned int cmdline_rest, panel_rest; unsigned int cmdline_rot, panel_rot; unsigned int sum_rot, sum_rest; panel_rot = ilog2(*rotation & DRM_MODE_ROTATE_MASK); cmdline_rot = ilog2(cmdline->rotation_reflection & DRM_MODE_ROTATE_MASK); sum_rot = (panel_rot + cmdline_rot) % 4; panel_rest = *rotation & ~DRM_MODE_ROTATE_MASK; cmdline_rest = cmdline->rotation_reflection & ~DRM_MODE_ROTATE_MASK; sum_rest = panel_rest ^ cmdline_rest; *rotation = (1 << sum_rot) | sum_rest; } /* * TODO: support 90 / 270 degree hardware rotation, * depending on the hardware this may require the framebuffer * to be in a specific tiling format. 
*/ if (((*rotation & DRM_MODE_ROTATE_MASK) != DRM_MODE_ROTATE_0 && (*rotation & DRM_MODE_ROTATE_MASK) != DRM_MODE_ROTATE_180) || !plane->rotation_property) return false; for (i = 0; i < plane->rotation_property->num_values; i++) valid_mask |= (1ULL << plane->rotation_property->values[i]); if (!(*rotation & valid_mask)) return false; return true; } EXPORT_SYMBOL(drm_client_rotation); static int drm_client_modeset_commit_atomic(struct drm_client_dev *client, bool active, bool check) { struct drm_device *dev = client->dev; struct drm_plane *plane; struct drm_atomic_state *state; struct drm_modeset_acquire_ctx ctx; struct drm_mode_set *mode_set; int ret; drm_modeset_acquire_init(&ctx, 0); state = drm_atomic_state_alloc(dev); if (!state) { ret = -ENOMEM; goto out_ctx; } state->acquire_ctx = &ctx; retry: drm_for_each_plane(plane, dev) { struct drm_plane_state *plane_state; plane_state = drm_atomic_get_plane_state(state, plane); if (IS_ERR(plane_state)) { ret = PTR_ERR(plane_state); goto out_state; } plane_state->rotation = DRM_MODE_ROTATE_0; /* disable non-primary: */ if (plane->type == DRM_PLANE_TYPE_PRIMARY) continue; ret = __drm_atomic_helper_disable_plane(plane, plane_state); if (ret != 0) goto out_state; } drm_client_for_each_modeset(mode_set, client) { struct drm_plane *primary = mode_set->crtc->primary; unsigned int rotation; if (drm_client_rotation(mode_set, &rotation)) { struct drm_plane_state *plane_state; /* Cannot fail as we've already gotten the plane state above */ plane_state = drm_atomic_get_new_plane_state(state, primary); plane_state->rotation = rotation; } ret = __drm_atomic_helper_set_config(mode_set, state); if (ret != 0) goto out_state; /* * __drm_atomic_helper_set_config() sets active when a * mode is set, unconditionally clear it if we force DPMS off */ if (!active) { struct drm_crtc *crtc = mode_set->crtc; struct drm_crtc_state *crtc_state = drm_atomic_get_new_crtc_state(state, crtc); crtc_state->active = false; } } if (check) ret = drm_atomic_check_only(state); else ret = drm_atomic_commit(state); out_state: if (ret == -EDEADLK) goto backoff; drm_atomic_state_put(state); out_ctx: drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); return ret; backoff: drm_atomic_state_clear(state); drm_modeset_backoff(&ctx); goto retry; } static int drm_client_modeset_commit_legacy(struct drm_client_dev *client) { struct drm_device *dev = client->dev; struct drm_mode_set *mode_set; struct drm_plane *plane; int ret = 0; drm_modeset_lock_all(dev); drm_for_each_plane(plane, dev) { if (plane->type != DRM_PLANE_TYPE_PRIMARY) drm_plane_force_disable(plane); if (plane->rotation_property) drm_mode_plane_set_obj_prop(plane, plane->rotation_property, DRM_MODE_ROTATE_0); } drm_client_for_each_modeset(mode_set, client) { struct drm_crtc *crtc = mode_set->crtc; if (crtc->funcs->cursor_set2) { ret = crtc->funcs->cursor_set2(crtc, NULL, 0, 0, 0, 0, 0); if (ret) goto out; } else if (crtc->funcs->cursor_set) { ret = crtc->funcs->cursor_set(crtc, NULL, 0, 0, 0); if (ret) goto out; } ret = drm_mode_set_config_internal(mode_set); if (ret) goto out; } out: drm_modeset_unlock_all(dev); return ret; } /** * drm_client_modeset_check() - Check modeset configuration * @client: DRM client * * Check modeset configuration. * * Returns: * Zero on success or negative error code on failure. 
*/ int drm_client_modeset_check(struct drm_client_dev *client) { int ret; if (!drm_drv_uses_atomic_modeset(client->dev)) return 0; mutex_lock(&client->modeset_mutex); ret = drm_client_modeset_commit_atomic(client, true, true); mutex_unlock(&client->modeset_mutex); return ret; } EXPORT_SYMBOL(drm_client_modeset_check); /** * drm_client_modeset_commit_locked() - Force commit CRTC configuration * @client: DRM client * * Commit modeset configuration to crtcs without checking if there is a DRM * master. The assumption is that the caller already holds an internal DRM * master reference acquired with drm_master_internal_acquire(). * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_commit_locked(struct drm_client_dev *client) { struct drm_device *dev = client->dev; int ret; mutex_lock(&client->modeset_mutex); if (drm_drv_uses_atomic_modeset(dev)) ret = drm_client_modeset_commit_atomic(client, true, false); else ret = drm_client_modeset_commit_legacy(client); mutex_unlock(&client->modeset_mutex); return ret; } EXPORT_SYMBOL(drm_client_modeset_commit_locked); /** * drm_client_modeset_commit() - Commit CRTC configuration * @client: DRM client * * Commit modeset configuration to crtcs. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_commit(struct drm_client_dev *client) { struct drm_device *dev = client->dev; int ret; if (!drm_master_internal_acquire(dev)) return -EBUSY; ret = drm_client_modeset_commit_locked(client); drm_master_internal_release(dev); return ret; } EXPORT_SYMBOL(drm_client_modeset_commit); static void drm_client_modeset_dpms_legacy(struct drm_client_dev *client, int dpms_mode) { struct drm_device *dev = client->dev; struct drm_connector *connector; struct drm_mode_set *modeset; struct drm_modeset_acquire_ctx ctx; int ret; DRM_MODESET_LOCK_ALL_BEGIN(dev, ctx, 0, ret); drm_client_for_each_modeset(modeset, client) { int j; if (!modeset->crtc->enabled) continue; for (j = 0; j < modeset->num_connectors; j++) { connector = modeset->connectors[j]; connector->funcs->dpms(connector, dpms_mode); drm_object_property_set_value(&connector->base, dev->mode_config.dpms_property, dpms_mode); } } DRM_MODESET_LOCK_ALL_END(dev, ctx, ret); } /** * drm_client_modeset_dpms() - Set DPMS mode * @client: DRM client * @mode: DPMS mode * * Note: For atomic drivers @mode is reduced to on/off. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_dpms(struct drm_client_dev *client, int mode) { struct drm_device *dev = client->dev; int ret = 0; if (!drm_master_internal_acquire(dev)) return -EBUSY; mutex_lock(&client->modeset_mutex); if (drm_drv_uses_atomic_modeset(dev)) ret = drm_client_modeset_commit_atomic(client, mode == DRM_MODE_DPMS_ON, false); else drm_client_modeset_dpms_legacy(client, mode); mutex_unlock(&client->modeset_mutex); drm_master_internal_release(dev); return ret; } EXPORT_SYMBOL(drm_client_modeset_dpms); #ifdef CONFIG_DRM_KUNIT_TEST #include "tests/drm_client_modeset_test.c" #endif |
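/*
 * Editorial example (not part of the original file): a rough sketch of how a
 * DRM client is expected to drive the modeset helpers above. The function
 * name example_client_restore() and its parameters are illustrative
 * placeholders only; real users (e.g. the fbdev emulation) also set up a
 * framebuffer between probing and committing the configuration.
 */
#if 0	/* illustrative only */
static int example_client_restore(struct drm_client_dev *client,
				  unsigned int width, unsigned int height)
{
	int ret;

	/* Pick a mode/CRTC configuration for the connected connectors. */
	ret = drm_client_modeset_probe(client, width, height);
	if (ret)
		return ret;

	/* Optionally validate the configuration (a no-op for legacy drivers). */
	ret = drm_client_modeset_check(client);
	if (ret)
		return ret;

	/* Program the CRTCs; returns -EBUSY if a DRM master is active. */
	ret = drm_client_modeset_commit(client);
	if (ret)
		return ret;

	/* Displays can later be blanked/unblanked via DPMS. */
	return drm_client_modeset_dpms(client, DRM_MODE_DPMS_ON);
}
#endif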
// SPDX-License-Identifier: GPL-2.0 #include <linux/swap_cgroup.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/swapops.h> /* depends on mm.h include */ static DEFINE_MUTEX(swap_cgroup_mutex); /* Pack the cgroup ids (unsigned short) of two entries into one swap_cgroup (atomic_t) */ #define ID_PER_SC (sizeof(struct swap_cgroup) / sizeof(unsigned short)) #define ID_SHIFT (BITS_PER_TYPE(unsigned short)) #define ID_MASK (BIT(ID_SHIFT) - 1) struct swap_cgroup { atomic_t ids; }; struct swap_cgroup_ctrl { struct swap_cgroup *map; }; static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; static unsigned short __swap_cgroup_id_lookup(struct swap_cgroup *map, pgoff_t offset) { unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT; unsigned int old_ids = atomic_read(&map[offset / ID_PER_SC].ids); BUILD_BUG_ON(!is_power_of_2(ID_PER_SC)); BUILD_BUG_ON(sizeof(struct swap_cgroup) != sizeof(atomic_t)); return (old_ids >> shift) & ID_MASK; } static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map, pgoff_t offset, unsigned short new_id) { unsigned short old_id; struct swap_cgroup *sc = &map[offset / ID_PER_SC]; unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT; unsigned int new_ids, old_ids = atomic_read(&sc->ids); do { old_id = (old_ids >> shift) & ID_MASK; new_ids = (old_ids & ~(ID_MASK << shift)); new_ids |= ((unsigned int)new_id) << shift; } while (!atomic_try_cmpxchg(&sc->ids, &old_ids, new_ids)); return old_id; } /** * swap_cgroup_record - record mem_cgroup for a set of swap entries. * These entries must belong to one single folio, that folio must be in * the process of being charged for swap space (swap out), and these * entries must not have been charged yet. * * @folio: the folio that the swap entries belong to * @id: mem_cgroup ID to be recorded * @ent: the first swap entry to be recorded */ void swap_cgroup_record(struct folio *folio, unsigned short id, swp_entry_t ent) { unsigned int nr_ents = folio_nr_pages(folio); struct swap_cgroup *map; pgoff_t offset, end; unsigned short old; offset = swp_offset(ent); end = offset + nr_ents; map = swap_cgroup_ctrl[swp_type(ent)].map; do { old = __swap_cgroup_id_xchg(map, offset, id); VM_BUG_ON(old); } while (++offset != end); } /** * swap_cgroup_clear - clear mem_cgroup for a set of swap entries. * These entries must be being uncharged from swap. They either * belong to one single folio in the swap cache (swap in for * cgroup v1), or no longer have any users (slot freeing). * * @ent: the first swap entry to be cleared * @nr_ents: number of swap entries to be cleared * * Returns the existing old value.
*/ unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents) { pgoff_t offset, end; struct swap_cgroup *map; unsigned short old, iter = 0; offset = swp_offset(ent); end = offset + nr_ents; map = swap_cgroup_ctrl[swp_type(ent)].map; do { old = __swap_cgroup_id_xchg(map, offset, 0); if (!iter) iter = old; VM_BUG_ON(iter != old); } while (++offset != end); return old; } /** * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry * @ent: swap entry to be looked up. * * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) */ unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { struct swap_cgroup_ctrl *ctrl; if (mem_cgroup_disabled()) return 0; ctrl = &swap_cgroup_ctrl[swp_type(ent)]; return __swap_cgroup_id_lookup(ctrl->map, swp_offset(ent)); } int swap_cgroup_swapon(int type, unsigned long max_pages) { struct swap_cgroup *map; struct swap_cgroup_ctrl *ctrl; if (mem_cgroup_disabled()) return 0; BUILD_BUG_ON(sizeof(unsigned short) * ID_PER_SC != sizeof(struct swap_cgroup)); map = vzalloc(DIV_ROUND_UP(max_pages, ID_PER_SC) * sizeof(struct swap_cgroup)); if (!map) goto nomem; ctrl = &swap_cgroup_ctrl[type]; mutex_lock(&swap_cgroup_mutex); ctrl->map = map; mutex_unlock(&swap_cgroup_mutex); return 0; nomem: pr_info("couldn't allocate enough memory for swap_cgroup\n"); pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n"); return -ENOMEM; } void swap_cgroup_swapoff(int type) { struct swap_cgroup *map; struct swap_cgroup_ctrl *ctrl; if (mem_cgroup_disabled()) return; mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; map = ctrl->map; ctrl->map = NULL; mutex_unlock(&swap_cgroup_mutex); vfree(map); } |
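/*
 * Editorial example (not part of the original file): the intended lifecycle
 * of the cgroup ids kept in this map, as seen from a caller such as
 * memcontrol.c. The function name example_swap_cgroup_usage() and its
 * arguments are illustrative placeholders only.
 */
#if 0	/* illustrative only */
static void example_swap_cgroup_usage(struct folio *folio, swp_entry_t entry,
				      unsigned short memcg_id)
{
	unsigned short id;

	/* Swap out: tag every swap entry backing @folio with its cgroup id. */
	swap_cgroup_record(folio, memcg_id, entry);

	/* Later, e.g. when charging on swap in, the id can be read back. */
	id = lookup_swap_cgroup_id(entry);

	/* Swap in / slot freeing: drop the association again. */
	id = swap_cgroup_clear(entry, folio_nr_pages(folio));
}
#endif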
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006 Thomas Gleixner * * This file contains driver APIs to the irq subsystem.
*/ #define pr_fmt(fmt) "genirq: " fmt #include <linux/irq.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> #include <linux/irqdomain.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/rt.h> #include <linux/sched/task.h> #include <linux/sched/isolation.h> #include <uapi/linux/sched/types.h> #include <linux/task_work.h> #include "internals.h" #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) DEFINE_STATIC_KEY_FALSE(force_irqthreads_key); static int __init setup_forced_irqthreads(char *arg) { static_branch_enable(&force_irqthreads_key); return 0; } early_param("threadirqs", setup_forced_irqthreads); #endif static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state); static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) { struct irq_data *irqd = irq_desc_get_irq_data(desc); bool inprogress; do { /* * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ guard(raw_spinlock_irqsave)(&desc->lock); inprogress = irqd_irq_inprogress(&desc->irq_data); /* * If requested and supported, check at the chip whether it * is in flight at the hardware level, i.e. already pending * in a CPU and waiting for service and acknowledge. */ if (!inprogress && sync_chip) { /* * Ignore the return code. inprogress is only updated * when the chip supports it. */ __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE, &inprogress); } /* Oops, that failed? */ } while (inprogress); } /** * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs) * @irq: interrupt number to wait for * * This function waits for any pending hard IRQ handlers for this interrupt * to complete before returning. If you use this function while holding a * resource the IRQ handler may need you will deadlock. It does not take * associated threaded handlers into account. * * Do not use this for shutdown scenarios where you must be sure that all * parts (hardirq and threaded handler) have completed. * * Returns: false if a threaded handler is active. * * This function may be called - with care - from IRQ context. * * It does not check whether there is an interrupt in flight at the * hardware level, but not serviced yet, as this might deadlock when called * with interrupts disabled and the target CPU of the interrupt is the * current CPU. */ bool synchronize_hardirq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { __synchronize_hardirq(desc, false); return !atomic_read(&desc->threads_active); } return true; } EXPORT_SYMBOL(synchronize_hardirq); static void __synchronize_irq(struct irq_desc *desc) { __synchronize_hardirq(desc, true); /* * We made sure that no hardirq handler is running. Now verify that no * threaded handlers are active. */ wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); } /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for * * This function waits for any pending IRQ handlers for this interrupt to * complete before returning. If you use this function while holding a * resource the IRQ handler may need you will deadlock. * * Can only be called from preemptible code as it might sleep when * an interrupt thread is associated to @irq. 
* * It optionally makes sure (when the irq chip supports that method) * that the interrupt is not pending in any CPU and waiting for * service. */ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) __synchronize_irq(desc); } EXPORT_SYMBOL(synchronize_irq); #ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; static bool __irq_can_set_affinity(struct irq_desc *desc) { if (!desc || !irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) return false; return true; } /** * irq_can_set_affinity - Check if the affinity of a given irq can be set * @irq: Interrupt to check * */ int irq_can_set_affinity(unsigned int irq) { return __irq_can_set_affinity(irq_to_desc(irq)); } /** * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space * @irq: Interrupt to check * * Like irq_can_set_affinity() above, but additionally checks for the * AFFINITY_MANAGED flag. */ bool irq_can_set_affinity_usr(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); return __irq_can_set_affinity(desc) && !irqd_affinity_is_managed(&desc->irq_data); } /** * irq_set_thread_affinity - Notify irq threads to adjust affinity * @desc: irq descriptor which has affinity changed * * Just set IRQTF_AFFINITY and delegate the affinity setting to the * interrupt thread itself. We can not call set_cpus_allowed_ptr() here as * we hold desc->lock and this code can be called from hard interrupt * context. */ static void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action; for_each_action_of_desc(desc, action) { if (action->thread) { set_bit(IRQTF_AFFINITY, &action->thread_flags); wake_up_process(action->thread); } if (action->secondary && action->secondary->thread) { set_bit(IRQTF_AFFINITY, &action->secondary->thread_flags); wake_up_process(action->secondary->thread); } } } #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static void irq_validate_effective_affinity(struct irq_data *data) { const struct cpumask *m = irq_data_get_effective_affinity_mask(data); struct irq_chip *chip = irq_data_get_irq_chip(data); if (!cpumask_empty(m)) return; pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n", chip->name, data->irq); } #else static inline void irq_validate_effective_affinity(struct irq_data *data) { } #endif static DEFINE_PER_CPU(struct cpumask, __tmp_mask); int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { struct cpumask *tmp_mask = this_cpu_ptr(&__tmp_mask); struct irq_desc *desc = irq_data_to_desc(data); struct irq_chip *chip = irq_data_get_irq_chip(data); const struct cpumask *prog_mask; int ret; if (!chip || !chip->irq_set_affinity) return -EINVAL; /* * If this is a managed interrupt and housekeeping is enabled on * it check whether the requested affinity mask intersects with * a housekeeping CPU. If so, then remove the isolated CPUs from * the mask and just keep the housekeeping CPU(s). This prevents * the affinity setter from routing the interrupt to an isolated * CPU to avoid that I/O submitted from a housekeeping CPU causes * interrupts on an isolated one. * * If the masks do not intersect or include online CPU(s) then * keep the requested mask. The isolated target CPUs are only * receiving interrupts when the I/O operation was submitted * directly from them. 
* * If all housekeeping CPUs in the affinity mask are offline, the * interrupt will be migrated by the CPU hotplug code once a * housekeeping CPU which belongs to the affinity mask comes * online. */ if (irqd_affinity_is_managed(data) && housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) { const struct cpumask *hk_mask; hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); cpumask_and(tmp_mask, mask, hk_mask); if (!cpumask_intersects(tmp_mask, cpu_online_mask)) prog_mask = mask; else prog_mask = tmp_mask; } else { prog_mask = mask; } /* * Make sure we only provide online CPUs to the irqchip, * unless we are being asked to force the affinity (in which * case we do as we are told). */ cpumask_and(tmp_mask, prog_mask, cpu_online_mask); if (!force && !cpumask_empty(tmp_mask)) ret = chip->irq_set_affinity(data, tmp_mask, force); else if (force) ret = chip->irq_set_affinity(data, mask, force); else ret = -EINVAL; switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: cpumask_copy(desc->irq_common_data.affinity, mask); fallthrough; case IRQ_SET_MASK_OK_NOCOPY: irq_validate_effective_affinity(data); irq_set_thread_affinity(desc); ret = 0; } return ret; } #ifdef CONFIG_GENERIC_PENDING_IRQ static inline int irq_set_affinity_pending(struct irq_data *data, const struct cpumask *dest) { struct irq_desc *desc = irq_data_to_desc(data); irqd_set_move_pending(data); irq_copy_pending(desc, dest); return 0; } #else static inline int irq_set_affinity_pending(struct irq_data *data, const struct cpumask *dest) { return -EBUSY; } #endif static int irq_try_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force) { int ret = irq_do_set_affinity(data, dest, force); /* * In case that the underlying vector management is busy and the * architecture supports the generic pending mechanism then utilize * this to avoid returning an error to user space. */ if (ret == -EBUSY && !force) ret = irq_set_affinity_pending(data, dest); return ret; } static bool irq_set_affinity_deactivated(struct irq_data *data, const struct cpumask *mask) { struct irq_desc *desc = irq_data_to_desc(data); /* * Handle irq chips which can handle affinity only in activated * state correctly * * If the interrupt is not yet activated, just store the affinity * mask and do not call the chip driver at all. On activation the * driver has to make sure anyway that the interrupt is in a * usable state so startup works. 
*/ if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data) || !irqd_affinity_on_activate(data)) return false; cpumask_copy(desc->irq_common_data.affinity, mask); irq_data_update_effective_affinity(data, mask); irqd_set(data, IRQD_AFFINITY_SET); return true; } int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, bool force) { struct irq_chip *chip = irq_data_get_irq_chip(data); struct irq_desc *desc = irq_data_to_desc(data); int ret = 0; if (!chip || !chip->irq_set_affinity) return -EINVAL; if (irq_set_affinity_deactivated(data, mask)) return 0; if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { ret = irq_try_set_affinity(data, mask, force); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); } if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); if (!schedule_work(&desc->affinity_notify->work)) { /* Work was already scheduled, drop our extra ref */ kref_put(&desc->affinity_notify->kref, desc->affinity_notify->release); } } irqd_set(data, IRQD_AFFINITY_SET); return ret; } /** * irq_update_affinity_desc - Update affinity management for an interrupt * @irq: The interrupt number to update * @affinity: Pointer to the affinity descriptor * * This interface can be used to configure the affinity management of * interrupts which have been allocated already. * * There are certain limitations on when it may be used - attempts to use it * for when the kernel is configured for generic IRQ reservation mode (in * config GENERIC_IRQ_RESERVATION_MODE) will fail, as it may conflict with * managed/non-managed interrupt accounting. In addition, attempts to use it on * an interrupt which is already started or which has already been configured * as managed will also fail, as these mean invalid init state or double init. */ int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity) { /* * Supporting this with the reservation scheme used by x86 needs * some more thought. Fail it for now. */ if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) return -EOPNOTSUPP; scoped_irqdesc_get_and_buslock(irq, 0) { struct irq_desc *desc = scoped_irqdesc; bool activated; /* Requires the interrupt to be shut down */ if (irqd_is_started(&desc->irq_data)) return -EBUSY; /* Interrupts which are already managed cannot be modified */ if (irqd_affinity_is_managed(&desc->irq_data)) return -EBUSY; /* * Deactivate the interrupt. That's required to undo * anything an earlier activation has established. 
*/ activated = irqd_is_activated(&desc->irq_data); if (activated) irq_domain_deactivate_irq(&desc->irq_data); if (affinity->is_managed) { irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN); } cpumask_copy(desc->irq_common_data.affinity, &affinity->mask); /* Restore the activation state */ if (activated) irq_domain_activate_irq(&desc->irq_data, false); return 0; } return -EINVAL; } static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) { struct irq_desc *desc = irq_to_desc(irq); if (!desc) return -EINVAL; guard(raw_spinlock_irqsave)(&desc->lock); return irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); } /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity * @cpumask: cpumask * * Fails if cpumask does not contain an online CPU */ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) { return __irq_set_affinity(irq, cpumask, false); } EXPORT_SYMBOL_GPL(irq_set_affinity); /** * irq_force_affinity - Force the irq affinity of a given irq * @irq: Interrupt to set affinity * @cpumask: cpumask * * Same as irq_set_affinity, but without checking the mask against * online cpus. * * Solely for low level cpu hotplug code, where we need to make per * cpu interrupts affine before the cpu becomes online. */ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) { return __irq_set_affinity(irq, cpumask, true); } EXPORT_SYMBOL_GPL(irq_force_affinity); int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity) { int ret = -EINVAL; scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { scoped_irqdesc->affinity_hint = m; ret = 0; } if (!ret && m && setaffinity) __irq_set_affinity(irq, m, false); return ret; } EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint); static void irq_affinity_notify(struct work_struct *work) { struct irq_affinity_notify *notify = container_of(work, struct irq_affinity_notify, work); struct irq_desc *desc = irq_to_desc(notify->irq); cpumask_var_t cpumask; if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) goto out; scoped_guard(raw_spinlock_irqsave, &desc->lock) { if (irq_move_pending(&desc->irq_data)) irq_get_pending(cpumask, desc); else cpumask_copy(cpumask, desc->irq_common_data.affinity); } notify->notify(notify, cpumask); free_cpumask_var(cpumask); out: kref_put(¬ify->kref, notify->release); } /** * irq_set_affinity_notifier - control notification of IRQ affinity changes * @irq: Interrupt for which to enable/disable notification * @notify: Context for notification, or %NULL to disable * notification. Function pointers must be initialised; * the other fields will be initialised by this function. * * Must be called in process context. Notification may only be enabled * after the IRQ is allocated and must be disabled before the IRQ is freed * using free_irq(). 
*/ int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { struct irq_desc *desc = irq_to_desc(irq); struct irq_affinity_notify *old_notify; /* The release function is promised process context */ might_sleep(); if (!desc || irq_is_nmi(desc)) return -EINVAL; /* Complete initialisation of *notify */ if (notify) { notify->irq = irq; kref_init(¬ify->kref); INIT_WORK(¬ify->work, irq_affinity_notify); } scoped_guard(raw_spinlock_irqsave, &desc->lock) { old_notify = desc->affinity_notify; desc->affinity_notify = notify; } if (old_notify) { if (cancel_work_sync(&old_notify->work)) { /* Pending work had a ref, put that one too */ kref_put(&old_notify->kref, old_notify->release); } kref_put(&old_notify->kref, old_notify->release); } return 0; } EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); #ifndef CONFIG_AUTO_IRQ_AFFINITY /* * Generic version of the affinity autoselector. */ int irq_setup_affinity(struct irq_desc *desc) { struct cpumask *set = irq_default_affinity; int node = irq_desc_get_node(desc); static DEFINE_RAW_SPINLOCK(mask_lock); static struct cpumask mask; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!__irq_can_set_affinity(desc)) return 0; guard(raw_spinlock)(&mask_lock); /* * Preserve the managed affinity setting and a userspace affinity * setup, but make sure that one of the targets is online. */ if (irqd_affinity_is_managed(&desc->irq_data) || irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { if (cpumask_intersects(desc->irq_common_data.affinity, cpu_online_mask)) set = desc->irq_common_data.affinity; else irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); } cpumask_and(&mask, cpu_online_mask, set); if (cpumask_empty(&mask)) cpumask_copy(&mask, cpu_online_mask); if (node != NUMA_NO_NODE) { const struct cpumask *nodemask = cpumask_of_node(node); /* make sure at least one of the cpus in nodemask is online */ if (cpumask_intersects(&mask, nodemask)) cpumask_and(&mask, &mask, nodemask); } return irq_do_set_affinity(&desc->irq_data, &mask, false); } #else /* Wrapper for ALPHA specific affinity selector magic */ int irq_setup_affinity(struct irq_desc *desc) { return irq_select_affinity(irq_desc_get_irq(desc)); } #endif /* CONFIG_AUTO_IRQ_AFFINITY */ #endif /* CONFIG_SMP */ /** * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt * @irq: interrupt number to set affinity * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU * specific data for percpu_devid interrupts * * This function uses the vCPU specific data to set the vCPU affinity for * an irq. The vCPU specific data is passed from outside, such as KVM. One * example code path is as below: KVM -> IOMMU -> irq_set_vcpu_affinity(). 
*/ int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) { scoped_irqdesc_get_and_lock(irq, 0) { struct irq_desc *desc = scoped_irqdesc; struct irq_data *data; struct irq_chip *chip; data = irq_desc_get_irq_data(desc); do { chip = irq_data_get_irq_chip(data); if (chip && chip->irq_set_vcpu_affinity) break; data = irqd_get_parent_data(data); } while (data); if (!data) return -ENOSYS; return chip->irq_set_vcpu_affinity(data, vcpu_info); } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); void __disable_irq(struct irq_desc *desc) { if (!desc->depth++) irq_disable(desc); } static int __disable_irq_nosync(unsigned int irq) { scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { __disable_irq(scoped_irqdesc); return 0; } return -EINVAL; } /** * disable_irq_nosync - disable an irq without waiting * @irq: Interrupt to disable * * Disable the selected interrupt line. Disables and Enables are * nested. * Unlike disable_irq(), this function does not ensure existing * instances of the IRQ handler have completed before returning. * * This function may be called from IRQ context. */ void disable_irq_nosync(unsigned int irq) { __disable_irq_nosync(irq); } EXPORT_SYMBOL(disable_irq_nosync); /** * disable_irq - disable an irq and wait for completion * @irq: Interrupt to disable * * Disable the selected interrupt line. Enables and Disables are nested. * * This function waits for any pending IRQ handlers for this interrupt to * complete before returning. If you use this function while holding a * resource the IRQ handler may need you will deadlock. * * Can only be called from preemptible code as it might sleep when an * interrupt thread is associated to @irq. * */ void disable_irq(unsigned int irq) { might_sleep(); if (!__disable_irq_nosync(irq)) synchronize_irq(irq); } EXPORT_SYMBOL(disable_irq); /** * disable_hardirq - disables an irq and waits for hardirq completion * @irq: Interrupt to disable * * Disable the selected interrupt line. Enables and Disables are nested. * * This function waits for any pending hard IRQ handlers for this interrupt * to complete before returning. If you use this function while holding a * resource the hard IRQ handler may need you will deadlock. * * When used to optimistically disable an interrupt from atomic context the * return value must be checked. * * Returns: false if a threaded handler is active. * * This function may be called - with care - from IRQ context. */ bool disable_hardirq(unsigned int irq) { if (!__disable_irq_nosync(irq)) return synchronize_hardirq(irq); return false; } EXPORT_SYMBOL_GPL(disable_hardirq); /** * disable_nmi_nosync - disable an nmi without waiting * @irq: Interrupt to disable * * Disable the selected interrupt line. Disables and enables are nested. * * The interrupt to disable must have been requested through request_nmi. * Unlike disable_nmi(), this function does not ensure existing * instances of the IRQ handler have completed before returning. */ void disable_nmi_nosync(unsigned int irq) { disable_irq_nosync(irq); } void __enable_irq(struct irq_desc *desc) { switch (desc->depth) { case 0: err_out: WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq_desc_get_irq(desc)); break; case 1: { if (desc->istate & IRQS_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ irq_settings_set_noprobe(desc); /* * Call irq_startup() not irq_enable() here because the * interrupt might be marked NOAUTOEN so irq_startup() * needs to be invoked when it gets enabled the first time. 
* This is also required when __enable_irq() is invoked for * a managed and shutdown interrupt from the S3 resume * path. * * If it was already started up, then irq_startup() will * invoke irq_enable() under the hood. */ irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE); break; } default: desc->depth--; } } /** * enable_irq - enable handling of an irq * @irq: Interrupt to enable * * Undoes the effect of one call to disable_irq(). If this matches the * last disable, processing of interrupts on this IRQ line is re-enabled. * * This function may be called from IRQ context only when * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! */ void enable_irq(unsigned int irq) { scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { struct irq_desc *desc = scoped_irqdesc; if (WARN(!desc->irq_data.chip, "enable_irq before setup/request_irq: irq %u\n", irq)) return; __enable_irq(desc); } } EXPORT_SYMBOL(enable_irq); /** * enable_nmi - enable handling of an nmi * @irq: Interrupt to enable * * The interrupt to enable must have been requested through request_nmi. * Undoes the effect of one call to disable_nmi(). If this matches the last * disable, processing of interrupts on this IRQ line is re-enabled. */ void enable_nmi(unsigned int irq) { enable_irq(irq); } static int set_irq_wake_real(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; if (irq_desc_get_chip(desc)->flags & IRQCHIP_SKIP_SET_WAKE) return 0; if (desc->irq_data.chip->irq_set_wake) ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); return ret; } /** * irq_set_irq_wake - control irq power management wakeup * @irq: interrupt to control * @on: enable/disable power management wakeup * * Enable/disable power management wakeup mode, which is disabled by * default. Enables and disables must match, just as they match for * non-wakeup mode support. * * Wakeup mode lets this IRQ wake the system from sleep states like * "suspend to RAM". * * Note: irq enable/disable state is completely orthogonal to the * enable/disable state of irq wake. An irq can be disabled with * disable_irq() and still wake the system as long as the irq has wake * enabled. If this does not hold, then the underlying irq chip and the * related driver need to be investigated. */ int irq_set_irq_wake(unsigned int irq, unsigned int on) { scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { struct irq_desc *desc = scoped_irqdesc; int ret = 0; /* Don't use NMIs as wake up interrupts please */ if (irq_is_nmi(desc)) return -EINVAL; /* * wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ if (on) { if (desc->wake_depth++ == 0) { ret = set_irq_wake_real(irq, on); if (ret) desc->wake_depth = 0; else irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); } } else { if (desc->wake_depth == 0) { WARN(1, "Unbalanced IRQ %d wake disable\n", irq); } else if (--desc->wake_depth == 0) { ret = set_irq_wake_real(irq, on); if (ret) desc->wake_depth = 1; else irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } return ret; } return -EINVAL; } EXPORT_SYMBOL(irq_set_irq_wake); /* * Internal function that tells the architecture code whether a * particular irq has been exclusively allocated or is available * for driver use. 
*/ bool can_request_irq(unsigned int irq, unsigned long irqflags) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { struct irq_desc *desc = scoped_irqdesc; if (irq_settings_can_request(desc)) { if (!desc->action || irqflags & desc->action->flags & IRQF_SHARED) return true; } } return false; } int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) { struct irq_chip *chip = desc->irq_data.chip; int ret, unmask = 0; if (!chip || !chip->irq_set_type) { /* * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? */ pr_debug("No set_type function for IRQ %d (%s)\n", irq_desc_get_irq(desc), chip ? (chip->name ? : "unknown") : "unknown"); return 0; } if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { if (!irqd_irq_masked(&desc->irq_data)) mask_irq(desc); if (!irqd_irq_disabled(&desc->irq_data)) unmask = 1; } /* Mask all flags except trigger mode */ flags &= IRQ_TYPE_SENSE_MASK; ret = chip->irq_set_type(&desc->irq_data, flags); switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); irqd_set(&desc->irq_data, flags); fallthrough; case IRQ_SET_MASK_OK_NOCOPY: flags = irqd_get_trigger_type(&desc->irq_data); irq_settings_set_trigger_mask(desc, flags); irqd_clear(&desc->irq_data, IRQD_LEVEL); irq_settings_clr_level(desc); if (flags & IRQ_TYPE_LEVEL_MASK) { irq_settings_set_level(desc); irqd_set(&desc->irq_data, IRQD_LEVEL); } ret = 0; break; default: pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n", flags, irq_desc_get_irq(desc), chip->irq_set_type); } if (unmask) unmask_irq(desc); return ret; } #ifdef CONFIG_HARDIRQS_SW_RESEND int irq_set_parent(int irq, int parent_irq) { scoped_irqdesc_get_and_lock(irq, 0) { scoped_irqdesc->parent_irq = parent_irq; return 0; } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_parent); #endif /* * Default primary interrupt handler for threaded interrupts. Is * assigned as primary handler when request_threaded_irq is called * with handler == NULL. Useful for oneshot interrupts. */ static irqreturn_t irq_default_primary_handler(int irq, void *dev_id) { return IRQ_WAKE_THREAD; } /* * Primary handler for nested threaded interrupts. Should never be * called. */ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) { WARN(1, "Primary handler called for nested irq %d\n", irq); return IRQ_NONE; } static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id) { WARN(1, "Secondary action handler called for irq %d\n", irq); return IRQ_NONE; } #ifdef CONFIG_SMP /* * Check whether we need to change the affinity of the interrupt thread. */ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { cpumask_var_t mask; bool valid = false; if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) return; __set_current_state(TASK_RUNNING); /* * In case we are out of memory we set IRQTF_AFFINITY again and * try again next time */ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { set_bit(IRQTF_AFFINITY, &action->thread_flags); return; } scoped_guard(raw_spinlock_irq, &desc->lock) { /* * This code is triggered unconditionally. Check the affinity * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. 
*/ if (cpumask_available(desc->irq_common_data.affinity)) { const struct cpumask *m; m = irq_data_get_effective_affinity_mask(&desc->irq_data); cpumask_copy(mask, m); valid = true; } } if (valid) set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); } #else static inline void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } #endif static int irq_wait_for_interrupt(struct irq_desc *desc, struct irqaction *action) { for (;;) { set_current_state(TASK_INTERRUPTIBLE); irq_thread_check_affinity(desc, action); if (kthread_should_stop()) { /* may need to run one last time */ if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { __set_current_state(TASK_RUNNING); return 0; } __set_current_state(TASK_RUNNING); return -1; } if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { __set_current_state(TASK_RUNNING); return 0; } schedule(); } } /* * Oneshot interrupts keep the irq line masked until the threaded * handler finished. unmask if the interrupt has not been disabled and * is marked MASKED. */ static void irq_finalize_oneshot(struct irq_desc *desc, struct irqaction *action) { if (!(desc->istate & IRQS_ONESHOT) || action->handler == irq_forced_secondary_handler) return; again: chip_bus_lock(desc); raw_spin_lock_irq(&desc->lock); /* * Implausible though it may be we need to protect us against * the following scenario: * * The thread is faster done than the hard interrupt handler * on the other CPU. If we unmask the irq line then the * interrupt can come in again and masks the line, leaves due * to IRQS_INPROGRESS and the irq line is masked forever. * * This also serializes the state of shared oneshot handlers * versus "desc->threads_oneshot |= action->thread_mask;" in * irq_wake_thread(). See the comment there which explains the * serialization. */ if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); cpu_relax(); goto again; } /* * Now check again, whether the thread should run. Otherwise * we would clear the threads_oneshot bit of this thread which * was just set. */ if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) goto out_unlock; desc->threads_oneshot &= ~action->thread_mask; if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && irqd_irq_masked(&desc->irq_data)) unmask_threaded_irq(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); } /* * Interrupts explicitly requested as threaded interrupts want to be * preemptible - many of them need to sleep and wait for slow busses to * complete. */ static irqreturn_t irq_thread_fn(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret = action->thread_fn(action->irq, action->dev_id); if (ret == IRQ_HANDLED) atomic_inc(&desc->threads_handled); irq_finalize_oneshot(desc, action); return ret; } /* * Interrupts which are not explicitly requested as threaded * interrupts rely on the implicit bh/preempt disable of the hard irq * context. So we need to disable bh here to avoid deadlocks and other * side effects. 
*/ static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret; local_bh_disable(); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_disable(); ret = irq_thread_fn(desc, action); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_enable(); local_bh_enable(); return ret; } void wake_threads_waitq(struct irq_desc *desc) { if (atomic_dec_and_test(&desc->threads_active)) wake_up(&desc->wait_for_threads); } static void irq_thread_dtor(struct callback_head *unused) { struct task_struct *tsk = current; struct irq_desc *desc; struct irqaction *action; if (WARN_ON_ONCE(!(current->flags & PF_EXITING))) return; action = kthread_data(tsk); pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", tsk->comm, tsk->pid, action->irq); desc = irq_to_desc(action->irq); /* * If IRQTF_RUNTHREAD is set, we need to decrement * desc->threads_active and wake possible waiters. */ if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) wake_threads_waitq(desc); /* Prevent a stale desc->threads_oneshot */ irq_finalize_oneshot(desc, action); } static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action) { struct irqaction *secondary = action->secondary; if (WARN_ON_ONCE(!secondary)) return; guard(raw_spinlock_irq)(&desc->lock); __irq_wake_thread(desc, secondary); } /* * Internal function to notify that a interrupt thread is ready. */ static void irq_thread_set_ready(struct irq_desc *desc, struct irqaction *action) { set_bit(IRQTF_READY, &action->thread_flags); wake_up(&desc->wait_for_threads); } /* * Internal function to wake up a interrupt thread and wait until it is * ready. */ static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc, struct irqaction *action) { if (!action || !action->thread) return; wake_up_process(action->thread); wait_event(desc->wait_for_threads, test_bit(IRQTF_READY, &action->thread_flags)); } /* * Interrupt handler thread */ static int irq_thread(void *data) { struct callback_head on_exit_work; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); irqreturn_t (*handler_fn)(struct irq_desc *desc, struct irqaction *action); irq_thread_set_ready(desc, action); sched_set_fifo(current); if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, &action->thread_flags)) handler_fn = irq_forced_thread_fn; else handler_fn = irq_thread_fn; init_task_work(&on_exit_work, irq_thread_dtor); task_work_add(current, &on_exit_work, TWA_NONE); while (!irq_wait_for_interrupt(desc, action)) { irqreturn_t action_ret; action_ret = handler_fn(desc, action); if (action_ret == IRQ_WAKE_THREAD) irq_wake_secondary(desc, action); wake_threads_waitq(desc); } /* * This is the regular exit path. __free_irq() is stopping the * thread via kthread_stop() after calling * synchronize_hardirq(). So neither IRQTF_RUNTHREAD nor the * oneshot mask bit can be set. 
*/ task_work_cancel_func(current, irq_thread_dtor); return 0; } /** * irq_wake_thread - wake the irq thread for the action identified by dev_id * @irq: Interrupt line * @dev_id: Device identity for which the thread should be woken */ void irq_wake_thread(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return; guard(raw_spinlock_irqsave)(&desc->lock); for_each_action_of_desc(desc, action) { if (action->dev_id == dev_id) { if (action->thread) __irq_wake_thread(desc, action); break; } } } EXPORT_SYMBOL_GPL(irq_wake_thread); static int irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads()) return 0; if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) return 0; /* * No further action required for interrupts which are requested as * threaded interrupts already */ if (new->handler == irq_default_primary_handler) return 0; new->flags |= IRQF_ONESHOT; /* * Handle the case where we have a real primary handler and a * thread handler. We force thread them as well by creating a * secondary action. */ if (new->handler && new->thread_fn) { /* Allocate the secondary action */ new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!new->secondary) return -ENOMEM; new->secondary->handler = irq_forced_secondary_handler; new->secondary->thread_fn = new->thread_fn; new->secondary->dev_id = new->dev_id; new->secondary->irq = new->irq; new->secondary->name = new->name; } /* Deal with the primary handler */ set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); new->thread_fn = new->handler; new->handler = irq_default_primary_handler; return 0; } static int irq_request_resources(struct irq_desc *desc) { struct irq_data *d = &desc->irq_data; struct irq_chip *c = d->chip; return c->irq_request_resources ? c->irq_request_resources(d) : 0; } static void irq_release_resources(struct irq_desc *desc) { struct irq_data *d = &desc->irq_data; struct irq_chip *c = d->chip; if (c->irq_release_resources) c->irq_release_resources(d); } static bool irq_supports_nmi(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /* Only IRQs directly managed by the root irqchip can be set as NMI */ if (d->parent_data) return false; #endif /* Don't support NMIs for chips behind a slow bus */ if (d->chip->irq_bus_lock || d->chip->irq_bus_sync_unlock) return false; return d->chip->flags & IRQCHIP_SUPPORTS_NMI; } static int irq_nmi_setup(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); struct irq_chip *c = d->chip; return c->irq_nmi_setup ? c->irq_nmi_setup(d) : -EINVAL; } static void irq_nmi_teardown(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); struct irq_chip *c = d->chip; if (c->irq_nmi_teardown) c->irq_nmi_teardown(d); } static int setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) { struct task_struct *t; if (!secondary) { t = kthread_create(irq_thread, new, "irq/%d-%s", irq, new->name); } else { t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq, new->name); } if (IS_ERR(t)) return PTR_ERR(t); /* * We keep the reference to the task struct even if * the thread dies to avoid that the interrupt code * references an already freed task_struct. */ new->thread = get_task_struct(t); /* * Tell the thread to set its affinity. 
This is * important for shared interrupt handlers as we do * not invoke setup_affinity() for the secondary * handlers as everything is already set up. Even for * interrupts marked with IRQF_NO_BALANCE this is * correct as we want the thread to move to the cpu(s) * on which the requesting code placed the interrupt. */ set_bit(IRQTF_AFFINITY, &new->thread_flags); return 0; } /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. * * Locking rules: * * desc->request_mutex Provides serialization against a concurrent free_irq() * chip_bus_lock Provides serialization for slow bus operations * desc->lock Provides serialization against hard interrupts * * chip_bus_lock and desc->lock are sufficient for all other management and * interrupt related functions. desc->request_mutex solely serializes * request/free_irq(). */ static int __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **old_ptr; unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; if (!desc) return -EINVAL; if (desc->irq_data.chip == &no_irq_chip) return -ENOSYS; if (!try_module_get(desc->owner)) return -ENODEV; new->irq = irq; /* * If the trigger type is not specified by the caller, * then use the default for this interrupt. */ if (!(new->flags & IRQF_TRIGGER_MASK)) new->flags |= irqd_get_trigger_type(&desc->irq_data); /* * Check whether the interrupt nests into another interrupt * thread. */ nested = irq_settings_is_nested_thread(desc); if (nested) { if (!new->thread_fn) { ret = -EINVAL; goto out_mput; } /* * Replace the primary handler which was provided from * the driver for non nested interrupt handling by the * dummy function which warns when called. */ new->handler = irq_nested_primary_handler; } else { if (irq_settings_can_thread(desc)) { ret = irq_setup_forced_threading(new); if (ret) goto out_mput; } } /* * Create a handler thread when a thread function is supplied * and the interrupt does not nest into another interrupt * thread. */ if (new->thread_fn && !nested) { ret = setup_irq_thread(new, irq, false); if (ret) goto out_mput; if (new->secondary) { ret = setup_irq_thread(new->secondary, irq, true); if (ret) goto out_thread; } } /* * Drivers are often written to work w/o knowledge about the * underlying irq chip implementation, so a request for a * threaded irq without a primary hard irq context handler * requires the ONESHOT flag to be set. Some irq chips like * MSI based interrupts are per se one shot safe. Check the * chip flags, so we can avoid the unmask dance at the end of * the threaded handler for those. */ if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) new->flags &= ~IRQF_ONESHOT; /* * Protects against a concurrent __free_irq() call which might wait * for synchronize_hardirq() to complete without holding the optional * chip bus lock and desc->lock. Also protects against handing out * a recycled oneshot thread_mask bit while it's still in use by * its previous owner. */ mutex_lock(&desc->request_mutex); /* * Acquire bus lock as the irq_request_resources() callback below * might rely on the serialization or the magic power management * functions which are abusing the irq_bus_lock() callback, */ chip_bus_lock(desc); /* First installed action requests resources. 
*/ if (!desc->action) { ret = irq_request_resources(desc); if (ret) { pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", new->name, irq, desc->irq_data.chip->name); goto out_bus_unlock; } } /* * The following block of code has to be executed atomically * protected against a concurrent interrupt and any of the other * management calls which are not serialized via * desc->request_mutex or the optional bus lock. */ raw_spin_lock_irqsave(&desc->lock, flags); old_ptr = &desc->action; old = *old_ptr; if (old) { /* * Can't share interrupts unless both agree to and are * the same type (level, edge, polarity). So both flag * fields must have IRQF_SHARED set and the bits which * set the trigger type must match. Also all must * agree on ONESHOT. * Interrupt lines used for NMIs cannot be shared. */ unsigned int oldtype; if (irq_is_nmi(desc)) { pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n", new->name, irq, desc->irq_data.chip->name); ret = -EINVAL; goto out_unlock; } /* * If nobody did set the configuration before, inherit * the one provided by the requester. */ if (irqd_trigger_type_was_set(&desc->irq_data)) { oldtype = irqd_get_trigger_type(&desc->irq_data); } else { oldtype = new->flags & IRQF_TRIGGER_MASK; irqd_set_trigger_type(&desc->irq_data, oldtype); } if (!((old->flags & new->flags) & IRQF_SHARED) || (oldtype != (new->flags & IRQF_TRIGGER_MASK))) goto mismatch; if ((old->flags & IRQF_ONESHOT) && (new->flags & IRQF_COND_ONESHOT)) new->flags |= IRQF_ONESHOT; else if ((old->flags ^ new->flags) & IRQF_ONESHOT) goto mismatch; /* All handlers must agree on per-cpuness */ if ((old->flags & IRQF_PERCPU) != (new->flags & IRQF_PERCPU)) goto mismatch; /* add new interrupt at end of irq queue */ do { /* * Or all existing action->thread_mask bits, * so we can find the next zero bit for this * new action. */ thread_mask |= old->thread_mask; old_ptr = &old->next; old = *old_ptr; } while (old); shared = 1; } /* * Setup the thread mask for this irqaction for ONESHOT. For * !ONESHOT irqs the thread mask is 0 so we can avoid a * conditional in irq_wake_thread(). */ if (new->flags & IRQF_ONESHOT) { /* * Unlikely to have 32 resp 64 irqs sharing one line, * but who knows. */ if (thread_mask == ~0UL) { ret = -EBUSY; goto out_unlock; } /* * The thread_mask for the action is or'ed to * desc->thread_active to indicate that the * IRQF_ONESHOT thread handler has been woken, but not * yet finished. The bit is cleared when a thread * completes. When all threads of a shared interrupt * line have completed desc->threads_active becomes * zero and the interrupt line is unmasked. See * handle.c:irq_wake_thread() for further information. * * If no thread is woken by primary (hard irq context) * interrupt handlers, then desc->threads_active is * also checked for zero to unmask the irq line in the * affected hard irq flow handlers * (handle_[fasteoi|level]_irq). * * The new action gets the first zero bit of * thread_mask assigned. See the loop above which or's * all existing action->thread_mask bits. */ new->thread_mask = 1UL << ffz(thread_mask); } else if (new->handler == irq_default_primary_handler && !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) { /* * The interrupt was requested with handler = NULL, so * we use the default primary handler for it. But it * does not have the oneshot flag set. 
In combination * with level interrupts this is deadly, because the * default primary handler just wakes the thread, then * the irq lines is reenabled, but the device still * has the level irq asserted. Rinse and repeat.... * * While this works for edge type interrupts, we play * it safe and reject unconditionally because we can't * say for sure which type this interrupt really * has. The type flags are unreliable as the * underlying chip implementation can override them. */ pr_err("Threaded irq requested with handler=NULL and !ONESHOT for %s (irq %d)\n", new->name, irq); ret = -EINVAL; goto out_unlock; } if (!shared) { /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { ret = __irq_set_trigger(desc, new->flags & IRQF_TRIGGER_MASK); if (ret) goto out_unlock; } /* * Activate the interrupt. That activation must happen * independently of IRQ_NOAUTOEN. request_irq() can fail * and the callers are supposed to handle * that. enable_irq() of an interrupt requested with * IRQ_NOAUTOEN is not supposed to fail. The activation * keeps it in shutdown mode, it merily associates * resources if necessary and if that's not possible it * fails. Interrupts which are in managed shutdown mode * will simply ignore that activation request. */ ret = irq_activate(desc); if (ret) goto out_unlock; desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ IRQS_ONESHOT | IRQS_WAITING); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); if (new->flags & IRQF_PERCPU) { irqd_set(&desc->irq_data, IRQD_PER_CPU); irq_settings_set_per_cpu(desc); if (new->flags & IRQF_NO_DEBUG) irq_settings_set_no_debug(desc); } if (noirqdebug) irq_settings_set_no_debug(desc); if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; /* Exclude IRQ from balancing if requested */ if (new->flags & IRQF_NOBALANCING) { irq_settings_set_no_balancing(desc); irqd_set(&desc->irq_data, IRQD_NO_BALANCING); } if (!(new->flags & IRQF_NO_AUTOEN) && irq_settings_can_autoenable(desc)) { irq_startup(desc, IRQ_RESEND, IRQ_START_COND); } else { /* * Shared interrupts do not go well with disabling * auto enable. The sharing interrupt might request * it while it's still disabled and then wait for * interrupts forever. */ WARN_ON_ONCE(new->flags & IRQF_SHARED); /* Undo nested disables: */ desc->depth = 1; } } else if (new->flags & IRQF_TRIGGER_MASK) { unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; unsigned int omsk = irqd_get_trigger_type(&desc->irq_data); if (nmsk != omsk) /* hope the handler works with current trigger mode */ pr_warn("irq %d uses trigger mode %u; requested %u\n", irq, omsk, nmsk); } *old_ptr = new; irq_pm_install_action(desc, new); /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; desc->irqs_unhandled = 0; /* * Check whether we disabled the irq via the spurious handler * before. Reenable it and give it another chance. */ if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { desc->istate &= ~IRQS_SPURIOUS_DISABLED; __enable_irq(desc); } raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); irq_setup_timings(desc, new); wake_up_and_wait_for_irq_thread_ready(desc, new); wake_up_and_wait_for_irq_thread_ready(desc, new->secondary); register_irq_proc(irq, desc); new->dir = NULL; register_handler_proc(irq, new); return 0; mismatch: if (!(new->flags & IRQF_PROBE_SHARED)) { pr_err("Flags mismatch irq %d. %08x (%s) vs. 
%08x (%s)\n", irq, new->flags, new->name, old->flags, old->name); #ifdef CONFIG_DEBUG_SHIRQ dump_stack(); #endif } ret = -EBUSY; out_unlock: raw_spin_unlock_irqrestore(&desc->lock, flags); if (!desc->action) irq_release_resources(desc); out_bus_unlock: chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); out_thread: if (new->thread) { struct task_struct *t = new->thread; new->thread = NULL; kthread_stop_put(t); } if (new->secondary && new->secondary->thread) { struct task_struct *t = new->secondary->thread; new->secondary->thread = NULL; kthread_stop_put(t); } out_mput: module_put(desc->owner); return ret; } /* * Internal function to unregister an irqaction - used to free * regular and special interrupts that are part of the architecture. */ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) { unsigned irq = desc->irq_data.irq; struct irqaction *action, **action_ptr; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); mutex_lock(&desc->request_mutex); chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); /* * There can be multiple actions per IRQ descriptor, find the right * one based on the dev_id: */ action_ptr = &desc->action; for (;;) { action = *action_ptr; if (!action) { WARN(1, "Trying to free already-free IRQ %d\n", irq); raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); return NULL; } if (action->dev_id == dev_id) break; action_ptr = &action->next; } /* Found it - now remove it from the list of entries: */ *action_ptr = action->next; irq_pm_remove_action(desc, action); /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { irq_settings_clr_disable_unlazy(desc); /* Only shutdown. Deactivate after synchronize_hardirq() */ irq_shutdown(desc); } #ifdef CONFIG_SMP /* make sure affinity_hint is cleaned up */ if (WARN_ON_ONCE(desc->affinity_hint)) desc->affinity_hint = NULL; #endif raw_spin_unlock_irqrestore(&desc->lock, flags); /* * Drop bus_lock here so the changes which were done in the chip * callbacks above are synced out to the irq chips which hang * behind a slow bus (I2C, SPI) before calling synchronize_hardirq(). * * Aside of that the bus_lock can also be taken from the threaded * handler in irq_finalize_oneshot() which results in a deadlock * because kthread_stop() would wait forever for the thread to * complete, which is blocked on the bus lock. * * The still held desc->request_mutex() protects against a * concurrent request_irq() of this irq so the release of resources * and timing data is properly serialized. */ chip_bus_sync_unlock(desc); unregister_handler_proc(irq, action); /* * Make sure it's not being used on another CPU and if the chip * supports it also make sure that there is no (not yet serviced) * interrupt in flight at the hardware level. */ __synchronize_irq(desc); #ifdef CONFIG_DEBUG_SHIRQ /* * It's a shared IRQ -- the driver ought to be prepared for an IRQ * event to happen even now it's being freed, so let's make sure that * is so by doing an extra call to the handler .... * * ( We do this after actually deregistering it, to make sure that a * 'real' IRQ doesn't run in parallel with our fake. ) */ if (action->flags & IRQF_SHARED) { local_irq_save(flags); action->handler(irq, dev_id); local_irq_restore(flags); } #endif /* * The action has already been removed above, but the thread writes * its oneshot mask bit when it completes. 
Though request_mutex is * held across this which prevents __setup_irq() from handing out * the same bit to a newly requested action. */ if (action->thread) { kthread_stop_put(action->thread); if (action->secondary && action->secondary->thread) kthread_stop_put(action->secondary->thread); } /* Last action releases resources */ if (!desc->action) { /* * Reacquire bus lock as irq_release_resources() might * require it to deallocate resources over the slow bus. */ chip_bus_lock(desc); /* * There is no interrupt on the fly anymore. Deactivate it * completely. */ scoped_guard(raw_spinlock_irqsave, &desc->lock) irq_domain_deactivate_irq(&desc->irq_data); irq_release_resources(desc); chip_bus_sync_unlock(desc); irq_remove_timings(desc); } mutex_unlock(&desc->request_mutex); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); kfree(action->secondary); return action; } /** * free_irq - free an interrupt allocated with request_irq * @irq: Interrupt line to free * @dev_id: Device identity to free * * Remove an interrupt handler. The handler is removed and if the interrupt * line is no longer in use by any driver it is disabled. On a shared IRQ * the caller must ensure the interrupt is disabled on the card it drives * before calling this function. The function does not return until any * executing interrupts for this IRQ have completed. * * This function must not be called from interrupt context. * * Returns the devname argument passed to request_irq. */ const void *free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; const char *devname; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return NULL; #ifdef CONFIG_SMP if (WARN_ON(desc->affinity_notify)) desc->affinity_notify = NULL; #endif action = __free_irq(desc, dev_id); if (!action) return NULL; devname = action->name; kfree(action); return devname; } EXPORT_SYMBOL(free_irq); /* This function must be called with desc->lock held */ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc) { const char *devname = NULL; desc->istate &= ~IRQS_NMI; if (!WARN_ON(desc->action == NULL)) { irq_pm_remove_action(desc, desc->action); devname = desc->action->name; unregister_handler_proc(irq, desc->action); kfree(desc->action); desc->action = NULL; } irq_settings_clr_disable_unlazy(desc); irq_shutdown_and_deactivate(desc); irq_release_resources(desc); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return devname; } const void *free_nmi(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || WARN_ON(!irq_is_nmi(desc))) return NULL; if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) return NULL; /* NMI still enabled */ if (WARN_ON(desc->depth == 0)) disable_nmi_nosync(irq); guard(raw_spinlock_irqsave)(&desc->lock); irq_nmi_teardown(desc); return __cleanup_nmi(irq, desc); } /** * request_threaded_irq - allocate an interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Primary handler for threaded interrupts. * If handler is NULL and thread_fn != NULL * the default primary handler is installed. * @thread_fn: Function called from the irq handler thread * If NULL, no irq thread is created * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the interrupt line * and IRQ handling. 
From the point this call is made your handler function * may be invoked. Since your handler function must clear any interrupt the * board raises, you must take care both to initialise your hardware and to * set up the interrupt handler in the right order. * * If you want to set up a threaded irq handler for your device then you * need to supply @handler and @thread_fn. @handler is still called in hard * interrupt context and has to check whether the interrupt originates from * the device. If yes it needs to disable the interrupt on the device and * return IRQ_WAKE_THREAD which will wake up the handler thread and run * @thread_fn. This split handler design is necessary to support shared * interrupts. * * @dev_id must be globally unique. Normally the address of the device data * structure is used as the cookie. Since the handler receives this value * it makes sense to use it. * * If your interrupt is shared you must pass a non NULL dev_id as this is * required when freeing the interrupt. * * Flags: * * IRQF_SHARED Interrupt is shared * IRQF_TRIGGER_* Specify active edge(s) or level * IRQF_ONESHOT Run thread_fn with interrupt line masked */ int request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long irqflags, const char *devname, void *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out * which interrupt is which (messes up the interrupt freeing * logic etc). * * Also shared interrupts do not go well with disabling auto enable. * The sharing interrupt might request it while it's still disabled * and then wait for interrupts forever. * * Also IRQF_COND_SUSPEND only makes sense for shared interrupts and * it cannot be set along with IRQF_NO_SUSPEND. */ if (((irqflags & IRQF_SHARED) && !dev_id) || ((irqflags & IRQF_SHARED) && (irqflags & IRQF_NO_AUTOEN)) || (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) || ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND))) return -EINVAL; desc = irq_to_desc(irq); if (!desc) return -EINVAL; if (!irq_settings_can_request(desc) || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return -EINVAL; if (!handler) { if (!thread_fn) return -EINVAL; handler = irq_default_primary_handler; } action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->thread_fn = thread_fn; action->flags = irqflags; action->name = devname; action->dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) { kfree(action); return retval; } retval = __setup_irq(irq, desc, action); if (retval) { irq_chip_pm_put(&desc->irq_data); kfree(action->secondary); kfree(action); } #ifdef CONFIG_DEBUG_SHIRQ_FIXME if (!retval && (irqflags & IRQF_SHARED)) { /* * It's a shared IRQ -- the driver ought to be prepared for it * to happen immediately, so let's make sure.... * We disable the irq to make sure that a 'real' IRQ doesn't * run in parallel with our fake. */ unsigned long flags; disable_irq(irq); local_irq_save(flags); handler(irq, dev_id); local_irq_restore(flags); enable_irq(irq); } #endif return retval; } EXPORT_SYMBOL(request_threaded_irq); /** * request_any_context_irq - allocate an interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Threaded handler for threaded interrupts. 
* @flags: Interrupt type flags * @name: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the interrupt line * and IRQ handling. It selects either a hardirq or threaded handling * method depending on the context. * * Returns: On failure, it returns a negative value. On success, it returns either * IRQC_IS_HARDIRQ or IRQC_IS_NESTED. */ int request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id) { struct irq_desc *desc; int ret; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; desc = irq_to_desc(irq); if (!desc) return -EINVAL; if (irq_settings_is_nested_thread(desc)) { ret = request_threaded_irq(irq, NULL, handler, flags, name, dev_id); return !ret ? IRQC_IS_NESTED : ret; } ret = request_irq(irq, handler, flags, name, dev_id); return !ret ? IRQC_IS_HARDIRQ : ret; } EXPORT_SYMBOL_GPL(request_any_context_irq); /** * request_nmi - allocate an interrupt line for NMI delivery * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Threaded handler for threaded interrupts. * @irqflags: Interrupt type flags * @name: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the interrupt line * and IRQ handling. It sets up the IRQ line to be handled as an NMI. * * An interrupt line delivering NMIs cannot be shared and IRQ handling * cannot be threaded. * * Interrupt lines requested for NMI delivering must produce per cpu * interrupts and have auto enabling setting disabled. * * @dev_id must be globally unique. Normally the address of the device data * structure is used as the cookie. Since the handler receives this value * it makes sense to use it. * * If the interrupt line cannot be used to deliver NMIs, function will fail * and return a negative value. */ int request_nmi(unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *name, void *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; /* NMI cannot be shared, used for Polling */ if (irqflags & (IRQF_SHARED | IRQF_COND_SUSPEND | IRQF_IRQPOLL)) return -EINVAL; if (!(irqflags & IRQF_PERCPU)) return -EINVAL; if (!handler) return -EINVAL; desc = irq_to_desc(irq); if (!desc || (irq_settings_can_autoenable(desc) && !(irqflags & IRQF_NO_AUTOEN)) || !irq_settings_can_request(desc) || WARN_ON(irq_settings_is_per_cpu_devid(desc)) || !irq_supports_nmi(desc)) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = irqflags | IRQF_NO_THREAD | IRQF_NOBALANCING; action->name = name; action->dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) goto err_out; retval = __setup_irq(irq, desc, action); if (retval) goto err_irq_setup; scoped_guard(raw_spinlock_irqsave, &desc->lock) { /* Setup NMI state */ desc->istate |= IRQS_NMI; retval = irq_nmi_setup(desc); if (retval) { __cleanup_nmi(irq, desc); return -EINVAL; } return 0; } err_irq_setup: irq_chip_pm_put(&desc->irq_data); err_out: kfree(action); return retval; } void enable_percpu_irq(unsigned int irq, unsigned int type) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { struct irq_desc *desc = scoped_irqdesc; /* * If the trigger type is not specified by the caller, then * use the default for this interrupt. 
*/ type &= IRQ_TYPE_SENSE_MASK; if (type == IRQ_TYPE_NONE) type = irqd_get_trigger_type(&desc->irq_data); if (type != IRQ_TYPE_NONE) { if (__irq_set_trigger(desc, type)) { WARN(1, "failed to set type for IRQ%d\n", irq); return; } } irq_percpu_enable(desc, smp_processor_id()); } } EXPORT_SYMBOL_GPL(enable_percpu_irq); void enable_percpu_nmi(unsigned int irq, unsigned int type) { enable_percpu_irq(irq, type); } /** * irq_percpu_is_enabled - Check whether the per cpu irq is enabled * @irq: Linux irq number to check for * * Must be called from a non migratable context. Returns the enable * state of a per cpu interrupt on the current cpu. */ bool irq_percpu_is_enabled(unsigned int irq) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) return cpumask_test_cpu(smp_processor_id(), scoped_irqdesc->percpu_enabled); return false; } EXPORT_SYMBOL_GPL(irq_percpu_is_enabled); void disable_percpu_irq(unsigned int irq) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) irq_percpu_disable(scoped_irqdesc, smp_processor_id()); } EXPORT_SYMBOL_GPL(disable_percpu_irq); void disable_percpu_nmi(unsigned int irq) { disable_percpu_irq(irq); } /* * Internal function to unregister a percpu irqaction. */ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); if (!desc) return NULL; scoped_guard(raw_spinlock_irqsave, &desc->lock) { action = desc->action; if (!action || action->percpu_dev_id != dev_id) { WARN(1, "Trying to free already-free IRQ %d\n", irq); return NULL; } if (!cpumask_empty(desc->percpu_enabled)) { WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", irq, cpumask_first(desc->percpu_enabled)); return NULL; } /* Found it - now remove it from the list of entries: */ desc->action = NULL; desc->istate &= ~IRQS_NMI; } unregister_handler_proc(irq, action); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return action; } /** * free_percpu_irq - free an interrupt allocated with request_percpu_irq * @irq: Interrupt line to free * @dev_id: Device identity to free * * Remove a percpu interrupt handler. The handler is removed, but the * interrupt line is not disabled. This must be done on each CPU before * calling this function. The function does not return until any executing * interrupts for this IRQ have completed. * * This function must not be called from interrupt context. */ void free_percpu_irq(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !irq_settings_is_per_cpu_devid(desc)) return; chip_bus_lock(desc); kfree(__free_percpu_irq(irq, dev_id)); chip_bus_sync_unlock(desc); } EXPORT_SYMBOL_GPL(free_percpu_irq); void free_percpu_nmi(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !irq_settings_is_per_cpu_devid(desc)) return; if (WARN_ON(!irq_is_nmi(desc))) return; kfree(__free_percpu_irq(irq, dev_id)); } /** * setup_percpu_irq - setup a per-cpu interrupt * @irq: Interrupt line to setup * @act: irqaction for the interrupt * * Used to statically setup per-cpu interrupts in the early boot process. 
*/ int setup_percpu_irq(unsigned int irq, struct irqaction *act) { struct irq_desc *desc = irq_to_desc(irq); int retval; if (!desc || !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) return retval; retval = __setup_irq(irq, desc, act); if (retval) irq_chip_pm_put(&desc->irq_data); return retval; } /** * __request_percpu_irq - allocate a percpu interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @flags: Interrupt type flags (IRQF_TIMER only) * @devname: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * * This call allocates interrupt resources and enables the interrupt on the * local CPU. If the interrupt is supposed to be enabled on other CPUs, it * has to be done on each CPU using enable_percpu_irq(). * * @dev_id must be globally unique. It is a per-cpu variable, and * the handler gets called with the interrupted CPU's instance of * that variable. */ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (!dev_id) return -EINVAL; desc = irq_to_desc(irq); if (!desc || !irq_settings_can_request(desc) || !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; if (flags && flags != IRQF_TIMER) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; action->name = devname; action->percpu_dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) { kfree(action); return retval; } retval = __setup_irq(irq, desc, action); if (retval) { irq_chip_pm_put(&desc->irq_data); kfree(action); } return retval; } EXPORT_SYMBOL_GPL(__request_percpu_irq); /** * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @name: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs * have to be setup on each CPU by calling prepare_percpu_nmi() before * being enabled on the same CPU by using enable_percpu_nmi(). * * @dev_id must be globally unique. It is a per-cpu variable, and the * handler gets called with the interrupted CPU's instance of that * variable. * * Interrupt lines requested for NMI delivering should have auto enabling * setting disabled. * * If the interrupt line cannot be used to deliver NMIs, function * will fail returning a negative value. 
*/ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (!handler) return -EINVAL; desc = irq_to_desc(irq); if (!desc || !irq_settings_can_request(desc) || !irq_settings_is_per_cpu_devid(desc) || irq_settings_can_autoenable(desc) || !irq_supports_nmi(desc)) return -EINVAL; /* The line cannot already be NMI */ if (irq_is_nmi(desc)) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD | IRQF_NOBALANCING; action->name = name; action->percpu_dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) goto err_out; retval = __setup_irq(irq, desc, action); if (retval) goto err_irq_setup; scoped_guard(raw_spinlock_irqsave, &desc->lock) desc->istate |= IRQS_NMI; return 0; err_irq_setup: irq_chip_pm_put(&desc->irq_data); err_out: kfree(action); return retval; } /** * prepare_percpu_nmi - performs CPU local setup for NMI delivery * @irq: Interrupt line to prepare for NMI delivery * * This call prepares an interrupt line to deliver NMI on the current CPU, * before that interrupt line gets enabled with enable_percpu_nmi(). * * As a CPU local operation, this should be called from non-preemptible * context. * * If the interrupt line cannot be used to deliver NMIs, the function will * fail, returning a negative value. */ int prepare_percpu_nmi(unsigned int irq) { int ret = -EINVAL; WARN_ON(preemptible()); scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { if (WARN(!irq_is_nmi(scoped_irqdesc), "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", irq)) return -EINVAL; ret = irq_nmi_setup(scoped_irqdesc); if (ret) pr_err("Failed to setup NMI delivery: irq %u\n", irq); } return ret; } /** * teardown_percpu_nmi - undoes NMI setup of IRQ line * @irq: Interrupt line from which CPU local NMI configuration should be removed * * This call undoes the setup done by prepare_percpu_nmi(). * * IRQ line should not be enabled for the current CPU. * As a CPU local operation, this should be called from non-preemptible * context. */ void teardown_percpu_nmi(unsigned int irq) { WARN_ON(preemptible()); scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { if (WARN_ON(!irq_is_nmi(scoped_irqdesc))) return; irq_nmi_teardown(scoped_irqdesc); } } static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) { struct irq_chip *chip; int err = -EINVAL; do { chip = irq_data_get_irq_chip(data); if (WARN_ON_ONCE(!chip)) return -ENODEV; if (chip->irq_get_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY data = data->parent_data; #else data = NULL; #endif } while (data); if (data) err = chip->irq_get_irqchip_state(data, which, state); return err; } /** * irq_get_irqchip_state - returns the irqchip state of an interrupt. * @irq: Interrupt line that is forwarded to a VM * @which: One of IRQCHIP_STATE_* the caller wants to know about * @state: a pointer to a boolean where the state is to be stored * * This call snapshots the internal irqchip state of an interrupt, * returning into @state the bit corresponding to state @which. * * This function should be called with preemption disabled if the interrupt * controller has per-cpu registers.
*/ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state) { scoped_irqdesc_get_and_buslock(irq, 0) { struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc); return __irq_get_irqchip_state(data, which, state); } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_get_irqchip_state); /** * irq_set_irqchip_state - set the state of a forwarded interrupt. * @irq: Interrupt line that is forwarded to a VM * @which: State to be restored (one of IRQCHIP_STATE_*) * @val: Value corresponding to @which * * This call sets the internal irqchip state of an interrupt, depending on * the value of @which. * * This function should be called with migration disabled if the interrupt * controller has per-cpu registers. */ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool val) { scoped_irqdesc_get_and_buslock(irq, 0) { struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc); struct irq_chip *chip; do { chip = irq_data_get_irq_chip(data); if (WARN_ON_ONCE(!chip)) return -ENODEV; if (chip->irq_set_irqchip_state) break; data = irqd_get_parent_data(data); } while (data); if (data) return chip->irq_set_irqchip_state(data, which, val); } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_irqchip_state); /** * irq_has_action - Check whether an interrupt is requested * @irq: The linux irq number * * Returns: A snapshot of the current state */ bool irq_has_action(unsigned int irq) { bool res; rcu_read_lock(); res = irq_desc_has_action(irq_to_desc(irq)); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(irq_has_action); /** * irq_check_status_bit - Check whether bits in the irq descriptor status are set * @irq: The linux irq number * @bitmask: The bitmask to evaluate * * Returns: True if one of the bits in @bitmask is set */ bool irq_check_status_bit(unsigned int irq, unsigned int bitmask) { struct irq_desc *desc; bool res = false; rcu_read_lock(); desc = irq_to_desc(irq); if (desc) res = !!(desc->status_use_accessors & bitmask); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(irq_check_status_bit); |
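The request_threaded_irq() kerneldoc above describes the split-handler model: a hard-irq @handler that checks whether its device raised the interrupt, quiesces it and returns IRQ_WAKE_THREAD, and a @thread_fn that does the work which may sleep. The sketch below illustrates that pattern from the driver side; struct foo_dev, the FOO_REG_* offsets and their register semantics are made up for illustration and are not part of the code above.

#include <linux/interrupt.h>
#include <linux/io.h>

/* Hypothetical device with made-up register offsets, for illustration only. */
struct foo_dev {
	void __iomem *regs;
};

#define FOO_REG_STATUS	0x00	/* made-up: non-zero when this device raised the irq */
#define FOO_REG_MASK	0x04	/* made-up: write 1 to mask, 0 to unmask the source */

static irqreturn_t foo_hardirq(int irq, void *dev_id)
{
	struct foo_dev *foo = dev_id;

	/* Shared line: report IRQ_NONE if this device did not raise it. */
	if (!readl(foo->regs + FOO_REG_STATUS))
		return IRQ_NONE;

	/* Quiesce the source on the device, then defer the work to the thread. */
	writel(1, foo->regs + FOO_REG_MASK);
	return IRQ_WAKE_THREAD;
}

static irqreturn_t foo_thread_fn(int irq, void *dev_id)
{
	struct foo_dev *foo = dev_id;

	/* Heavy lifting that may sleep goes here, then re-enable the source. */
	writel(0, foo->regs + FOO_REG_MASK);
	return IRQ_HANDLED;
}

static int foo_request_irq(struct foo_dev *foo, unsigned int irq)
{
	/* dev_id must be unique per handler; the device structure is typical. */
	return request_threaded_irq(irq, foo_hardirq, foo_thread_fn,
				    IRQF_SHARED, "foo", foo);
}

Passing handler == NULL with a non-NULL thread_fn would instead install irq_default_primary_handler, which, as enforced in __setup_irq() above, additionally requires IRQF_ONESHOT unless the chip is IRQCHIP_ONESHOT_SAFE.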
| 15 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef IOU_NAPI_H #define IOU_NAPI_H #include <linux/kernel.h> #include <linux/io_uring.h> #include <net/busy_poll.h> #ifdef CONFIG_NET_RX_BUSY_POLL void io_napi_init(struct io_ring_ctx *ctx); void io_napi_free(struct io_ring_ctx *ctx); int io_register_napi(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id); void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); static inline bool io_napi(struct io_ring_ctx *ctx) { return !list_empty(&ctx->napi_list); } static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { if (!io_napi(ctx)) return; __io_napi_busy_loop(ctx, iowq); } /* * io_napi_add() - Add napi id to the busy poll list * @req: pointer to io_kiocb request * * Add the napi id of the socket to the napi busy poll list and hash table. */ static inline void io_napi_add(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct socket *sock; if (READ_ONCE(ctx->napi_track_mode) != IO_URING_NAPI_TRACKING_DYNAMIC) return; sock = sock_from_file(req->file); if (sock && sock->sk) __io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id)); } #else static inline void io_napi_init(struct io_ring_ctx *ctx) { } static inline void io_napi_free(struct io_ring_ctx *ctx) { } static inline int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) { return -EOPNOTSUPP; } static inline int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) { return -EOPNOTSUPP; } static inline bool io_napi(struct io_ring_ctx *ctx) { return false; } static inline void io_napi_add(struct io_kiocb *req) { } static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { } static inline int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) { return 0; } #endif /* CONFIG_NET_RX_BUSY_POLL */ #endif |
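Since the !CONFIG_NET_RX_BUSY_POLL branch above provides empty inline stubs, callers of the NAPI helpers need no #ifdefs of their own. A minimal caller sketch, assuming a hypothetical io_wait_example() wrapper (io_ring_ctx and io_wait_queue are the real io_uring types from the header above):

#include "napi.h"

/*
 * Hypothetical caller: poll the registered napi ids (if any) before falling
 * back to the normal completion-queue wait. With busy polling compiled out,
 * or with no napi ids tracked, io_napi_busy_loop() is a no-op.
 */
static void io_wait_example(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
{
	io_napi_busy_loop(ctx, iowq);

	/* ... continue with the regular wait for completions ... */
}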
| 59 59 57 58 59 59 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 | /* * Copyright (c) 2016 Intel Corporation * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. */ #ifndef __DRM_FRAMEBUFFER_H__ #define __DRM_FRAMEBUFFER_H__ #include <linux/bits.h> #include <linux/ctype.h> #include <linux/list.h> #include <linux/sched.h> #include <drm/drm_fourcc.h> #include <drm/drm_mode_object.h> struct drm_clip_rect; struct drm_device; struct drm_file; struct drm_framebuffer; struct drm_gem_object; /** * struct drm_framebuffer_funcs - framebuffer hooks */ struct drm_framebuffer_funcs { /** * @destroy: * * Clean up framebuffer resources, specifically also unreference the * backing storage. The core guarantees to call this function for every * framebuffer successfully created by calling * &drm_mode_config_funcs.fb_create. Drivers must also call * drm_framebuffer_cleanup() to release DRM core resources for this * framebuffer. */ void (*destroy)(struct drm_framebuffer *framebuffer); /** * @create_handle: * * Create a buffer handle in the driver-specific buffer manager (either * GEM or TTM) valid for the passed-in &struct drm_file. This is used by * the core to implement the GETFB IOCTL, which returns (for * sufficiently priviledged user) also a native buffer handle. 
This can * be used for seamless transitions between modesetting clients by * copying the current screen contents to a private buffer and blending * between that and the new contents. * * GEM based drivers should call drm_gem_handle_create() to create the * handle. * * RETURNS: * * 0 on success or a negative error code on failure. */ int (*create_handle)(struct drm_framebuffer *fb, struct drm_file *file_priv, unsigned int *handle); /** * @dirty: * * Optional callback for the dirty fb IOCTL. * * Userspace can notify the driver via this callback that an area of the * framebuffer has changed and should be flushed to the display * hardware. This can also be used internally, e.g. by the fbdev * emulation, though that's not the case currently. * * See documentation in drm_mode.h for the struct drm_mode_fb_dirty_cmd * for more information as all the semantics and arguments have a one to * one mapping on this function. * * Atomic drivers should use drm_atomic_helper_dirtyfb() to implement * this hook. * * RETURNS: * * 0 on success or a negative error code on failure. */ int (*dirty)(struct drm_framebuffer *framebuffer, struct drm_file *file_priv, unsigned flags, unsigned color, struct drm_clip_rect *clips, unsigned num_clips); }; #define DRM_FRAMEBUFFER_HAS_HANDLE_REF(_i) BIT(0u + (_i)) /** * struct drm_framebuffer - frame buffer object * * Note that the fb is refcounted for the benefit of driver internals, * for example some hw, disabling a CRTC/plane is asynchronous, and * scanout does not actually complete until the next vblank. So some * cleanup (like releasing the reference(s) on the backing GEM bo(s)) * should be deferred. In cases like this, the driver would like to * hold a ref to the fb even though it has already been removed from * userspace perspective. See drm_framebuffer_get() and * drm_framebuffer_put(). * * The refcount is stored inside the mode object @base. */ struct drm_framebuffer { /** * @dev: DRM device this framebuffer belongs to */ struct drm_device *dev; /** * @head: Place on the &drm_mode_config.fb_list, access protected by * &drm_mode_config.fb_lock. */ struct list_head head; /** * @base: base modeset object structure, contains the reference count. */ struct drm_mode_object base; /** * @comm: Name of the process allocating the fb, used for fb dumping. */ char comm[TASK_COMM_LEN]; /** * @format: framebuffer format information */ const struct drm_format_info *format; /** * @funcs: framebuffer vfunc table */ const struct drm_framebuffer_funcs *funcs; /** * @pitches: Line stride per buffer. For userspace created object this * is copied from drm_mode_fb_cmd2. */ unsigned int pitches[DRM_FORMAT_MAX_PLANES]; /** * @offsets: Offset from buffer start to the actual pixel data in bytes, * per buffer. For userspace created object this is copied from * drm_mode_fb_cmd2. * * Note that this is a linear offset and does not take into account * tiling or buffer layout per @modifier. It is meant to be used when * the actual pixel data for this framebuffer plane starts at an offset, * e.g. when multiple planes are allocated within the same backing * storage buffer object. For tiled layouts this generally means its * @offsets must at least be tile-size aligned, but hardware often has * stricter requirements. * * This should not be used to specifiy x/y pixel offsets into the buffer * data (even for linear buffers). Specifying an x/y pixel offset is * instead done through the source rectangle in &struct drm_plane_state. 
*/ unsigned int offsets[DRM_FORMAT_MAX_PLANES]; /** * @modifier: Data layout modifier. This is used to describe * tiling, or also special layouts (like compression) of auxiliary * buffers. For userspace created object this is copied from * drm_mode_fb_cmd2. */ uint64_t modifier; /** * @width: Logical width of the visible area of the framebuffer, in * pixels. */ unsigned int width; /** * @height: Logical height of the visible area of the framebuffer, in * pixels. */ unsigned int height; /** * @flags: Framebuffer flags like DRM_MODE_FB_INTERLACED or * DRM_MODE_FB_MODIFIERS. */ int flags; /** * @internal_flags: Framebuffer flags like DRM_FRAMEBUFFER_HAS_HANDLE_REF. */ unsigned int internal_flags; /** * @filp_head: Placed on &drm_file.fbs, protected by &drm_file.fbs_lock. */ struct list_head filp_head; /** * @obj: GEM objects backing the framebuffer, one per plane (optional). * * This is used by the GEM framebuffer helpers, see e.g. * drm_gem_fb_create(). */ struct drm_gem_object *obj[DRM_FORMAT_MAX_PLANES]; }; #define obj_to_fb(x) container_of(x, struct drm_framebuffer, base) int drm_framebuffer_init(struct drm_device *dev, struct drm_framebuffer *fb, const struct drm_framebuffer_funcs *funcs); struct drm_framebuffer *drm_framebuffer_lookup(struct drm_device *dev, struct drm_file *file_priv, uint32_t id); void drm_framebuffer_remove(struct drm_framebuffer *fb); void drm_framebuffer_cleanup(struct drm_framebuffer *fb); void drm_framebuffer_unregister_private(struct drm_framebuffer *fb); /** * drm_framebuffer_get - acquire a framebuffer reference * @fb: DRM framebuffer * * This function increments the framebuffer's reference count. */ static inline void drm_framebuffer_get(struct drm_framebuffer *fb) { drm_mode_object_get(&fb->base); } /** * drm_framebuffer_put - release a framebuffer reference * @fb: DRM framebuffer * * This function decrements the framebuffer's reference count and frees the * framebuffer if the reference count drops to zero. */ static inline void drm_framebuffer_put(struct drm_framebuffer *fb) { drm_mode_object_put(&fb->base); } /** * drm_framebuffer_read_refcount - read the framebuffer reference count. * @fb: framebuffer * * This function returns the framebuffer's reference count. */ static inline uint32_t drm_framebuffer_read_refcount(const struct drm_framebuffer *fb) { return kref_read(&fb->base.refcount); } /** * drm_framebuffer_assign - store a reference to the fb * @p: location to store framebuffer * @fb: new framebuffer (may be NULL) * * This function sets the location to store a reference to the framebuffer, * unreferencing the framebuffer that was previously stored in that location. */ static inline void drm_framebuffer_assign(struct drm_framebuffer **p, struct drm_framebuffer *fb) { if (fb) drm_framebuffer_get(fb); if (*p) drm_framebuffer_put(*p); *p = fb; } /* * drm_for_each_fb - iterate over all framebuffers * @fb: the loop cursor * @dev: the DRM device * * Iterate over all framebuffers of @dev. User must hold * &drm_mode_config.fb_lock. */ #define drm_for_each_fb(fb, dev) \ for (WARN_ON(!mutex_is_locked(&(dev)->mode_config.fb_lock)), \ fb = list_first_entry(&(dev)->mode_config.fb_list, \ struct drm_framebuffer, head); \ &fb->head != (&(dev)->mode_config.fb_list); \ fb = list_next_entry(fb, head)) /** * struct drm_afbc_framebuffer - a special afbc frame buffer object * * A derived class of struct drm_framebuffer, dedicated for afbc use cases. */ struct drm_afbc_framebuffer { /** * @base: base framebuffer structure.
*/ struct drm_framebuffer base; /** * @block_width: width of a single afbc block */ u32 block_width; /** * @block_height: height of a single afbc block */ u32 block_height; /** * @aligned_width: aligned frame buffer width */ u32 aligned_width; /** * @aligned_height: aligned frame buffer height */ u32 aligned_height; /** * @offset: offset of the first afbc header */ u32 offset; /** * @afbc_size: minimum size of afbc buffer */ u32 afbc_size; }; #define fb_to_afbc_fb(x) container_of(x, struct drm_afbc_framebuffer, base) #endif |
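drm_framebuffer_assign() above encapsulates the take-new-reference/drop-old-reference ordering for a stored framebuffer pointer. A minimal sketch of a driver keeping such a long-lived pointer current; struct foo_output and foo_output_set_fb() are hypothetical names, not part of the header above.

#include <drm/drm_framebuffer.h>

/* Hypothetical driver-private object that pins a framebuffer. */
struct foo_output {
	struct drm_framebuffer *pinned_fb;	/* holds one reference or NULL */
};

static void foo_output_set_fb(struct foo_output *out,
			      struct drm_framebuffer *new_fb)
{
	/* Grabs a ref on new_fb (may be NULL) and drops the ref on the old fb. */
	drm_framebuffer_assign(&out->pinned_fb, new_fb);
}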
| // SPDX-License-Identifier: GPL-2.0-only /* * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and * Shaohua Li <shli@fb.com> */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/init.h> #include "null_blk.h" #undef pr_fmt #define pr_fmt(fmt) "null_blk: " fmt #define FREE_BATCH 16 #define TICKS_PER_SEC 50ULL #define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION static DECLARE_FAULT_ATTR(null_timeout_attr); static
DECLARE_FAULT_ATTR(null_requeue_attr); static DECLARE_FAULT_ATTR(null_init_hctx_attr); #endif static inline u64 mb_per_tick(int mbps) { return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); } /* * Status flags for nullb_device. * * CONFIGURED: Device has been configured and turned on. Cannot reconfigure. * UP: Device is currently on and visible in userspace. * THROTTLED: Device is being throttled. * CACHE: Device is using a write-back cache. */ enum nullb_device_flags { NULLB_DEV_FL_CONFIGURED = 0, NULLB_DEV_FL_UP = 1, NULLB_DEV_FL_THROTTLED = 2, NULLB_DEV_FL_CACHE = 3, }; #define MAP_SZ ((PAGE_SIZE >> SECTOR_SHIFT) + 2) /* * nullb_page is a page in memory for nullb devices. * * @page: The page holding the data. * @bitmap: The bitmap represents which sector in the page has data. * Each bit represents one block size. For example, sector 8 * will use the 7th bit * The highest 2 bits of bitmap are for special purpose. LOCK means the cache * page is being flushed to storage. FREE means the cache page is freed and * should be skipped from flushing to storage. Please see * null_make_cache_space */ struct nullb_page { struct page *page; DECLARE_BITMAP(bitmap, MAP_SZ); }; #define NULLB_PAGE_LOCK (MAP_SZ - 1) #define NULLB_PAGE_FREE (MAP_SZ - 2) static LIST_HEAD(nullb_list); static struct mutex lock; static int null_major; static DEFINE_IDA(nullb_indexes); static struct blk_mq_tag_set tag_set; enum { NULL_IRQ_NONE = 0, NULL_IRQ_SOFTIRQ = 1, NULL_IRQ_TIMER = 2, }; static bool g_virt_boundary; module_param_named(virt_boundary, g_virt_boundary, bool, 0444); MODULE_PARM_DESC(virt_boundary, "Require a virtual boundary for the device. Default: False"); static int g_no_sched; module_param_named(no_sched, g_no_sched, int, 0444); MODULE_PARM_DESC(no_sched, "No io scheduler"); static int g_submit_queues = 1; module_param_named(submit_queues, g_submit_queues, int, 0444); MODULE_PARM_DESC(submit_queues, "Number of submission queues"); static int g_poll_queues = 1; module_param_named(poll_queues, g_poll_queues, int, 0444); MODULE_PARM_DESC(poll_queues, "Number of IOPOLL submission queues"); static int g_home_node = NUMA_NO_NODE; module_param_named(home_node, g_home_node, int, 0444); MODULE_PARM_DESC(home_node, "Home node for the device"); #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION /* * For more details about fault injection, please refer to * Documentation/fault-injection/fault-injection.rst. */ static char g_timeout_str[80]; module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444); MODULE_PARM_DESC(timeout, "Fault injection. timeout=<interval>,<probability>,<space>,<times>"); static char g_requeue_str[80]; module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444); MODULE_PARM_DESC(requeue, "Fault injection. requeue=<interval>,<probability>,<space>,<times>"); static char g_init_hctx_str[80]; module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444); MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>"); #endif /* * Historic queue modes. * * These days nothing but NULL_Q_MQ is actually supported, but we keep it the enum for error reporting. 
*/ enum { NULL_Q_BIO = 0, NULL_Q_RQ = 1, NULL_Q_MQ = 2, }; static int g_queue_mode = NULL_Q_MQ; static int null_param_store_val(const char *str, int *val, int min, int max) { int ret, new_val; ret = kstrtoint(str, 10, &new_val); if (ret) return -EINVAL; if (new_val < min || new_val > max) return -EINVAL; *val = new_val; return 0; } static int null_set_queue_mode(const char *str, const struct kernel_param *kp) { return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ); } static const struct kernel_param_ops null_queue_mode_param_ops = { .set = null_set_queue_mode, .get = param_get_int, }; device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444); MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); static int g_gb = 250; module_param_named(gb, g_gb, int, 0444); MODULE_PARM_DESC(gb, "Size in GB"); static int g_bs = 512; module_param_named(bs, g_bs, int, 0444); MODULE_PARM_DESC(bs, "Block size (in bytes)"); static int g_max_sectors; module_param_named(max_sectors, g_max_sectors, int, 0444); MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)"); static unsigned int nr_devices = 1; module_param(nr_devices, uint, 0444); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); static bool g_blocking; module_param_named(blocking, g_blocking, bool, 0444); MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); static bool g_shared_tags; module_param_named(shared_tags, g_shared_tags, bool, 0444); MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); static bool g_shared_tag_bitmap; module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444); MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq"); static int g_irqmode = NULL_IRQ_SOFTIRQ; static int null_set_irqmode(const char *str, const struct kernel_param *kp) { return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE, NULL_IRQ_TIMER); } static const struct kernel_param_ops null_irqmode_param_ops = { .set = null_set_irqmode, .get = param_get_int, }; device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444); MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); static unsigned long g_completion_nsec = 10000; module_param_named(completion_nsec, g_completion_nsec, ulong, 0444); MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); static int g_hw_queue_depth = 64; module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444); MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); static bool g_use_per_node_hctx; module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); static bool g_memory_backed; module_param_named(memory_backed, g_memory_backed, bool, 0444); MODULE_PARM_DESC(memory_backed, "Create a memory-backed block device. Default: false"); static bool g_discard; module_param_named(discard, g_discard, bool, 0444); MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed null_blk device). Default: false"); static unsigned long g_cache_size; module_param_named(cache_size, g_cache_size, ulong, 0444); MODULE_PARM_DESC(cache_size, "Cache size in MiB for memory-backed device. 
Default: 0 (none)"); static bool g_fua = true; module_param_named(fua, g_fua, bool, 0444); MODULE_PARM_DESC(fua, "Enable/disable FUA support when cache_size is used. Default: true"); static unsigned int g_mbps; module_param_named(mbps, g_mbps, uint, 0444); MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)"); static bool g_zoned; module_param_named(zoned, g_zoned, bool, S_IRUGO); MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); static unsigned long g_zone_size = 256; module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); static unsigned long g_zone_capacity; module_param_named(zone_capacity, g_zone_capacity, ulong, 0444); MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size"); static unsigned int g_zone_nr_conv; module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0"); static unsigned int g_zone_max_open; module_param_named(zone_max_open, g_zone_max_open, uint, 0444); MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)"); static unsigned int g_zone_max_active; module_param_named(zone_max_active, g_zone_max_active, uint, 0444); MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); static int g_zone_append_max_sectors = INT_MAX; module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444); MODULE_PARM_DESC(zone_append_max_sectors, "Maximum size of a zone append command (in 512B sectors). Specify 0 for zone append emulation"); static bool g_zone_full; module_param_named(zone_full, g_zone_full, bool, S_IRUGO); MODULE_PARM_DESC(zone_full, "Initialize the sequential write required zones of a zoned device to be full. Default: false"); static bool g_rotational; module_param_named(rotational, g_rotational, bool, S_IRUGO); MODULE_PARM_DESC(rotational, "Set the rotational feature for the device. Default: false"); static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); static int null_add_dev(struct nullb_device *dev); static struct nullb *null_find_dev_by_name(const char *name); static void null_free_device_storage(struct nullb_device *dev, bool is_cache); static inline struct nullb_device *to_nullb_device(struct config_item *item) { return item ? 
container_of(to_config_group(item), struct nullb_device, group) : NULL; } static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page) { return snprintf(page, PAGE_SIZE, "%u\n", val); } static inline ssize_t nullb_device_ulong_attr_show(unsigned long val, char *page) { return snprintf(page, PAGE_SIZE, "%lu\n", val); } static inline ssize_t nullb_device_bool_attr_show(bool val, char *page) { return snprintf(page, PAGE_SIZE, "%u\n", val); } static ssize_t nullb_device_uint_attr_store(unsigned int *val, const char *page, size_t count) { unsigned int tmp; int result; result = kstrtouint(page, 0, &tmp); if (result < 0) return result; *val = tmp; return count; } static ssize_t nullb_device_ulong_attr_store(unsigned long *val, const char *page, size_t count) { int result; unsigned long tmp; result = kstrtoul(page, 0, &tmp); if (result < 0) return result; *val = tmp; return count; } static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, size_t count) { bool tmp; int result; result = kstrtobool(page, &tmp); if (result < 0) return result; *val = tmp; return count; } /* The following macro should only be used with TYPE = {uint, ulong, bool}. */ #define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY) \ static ssize_t \ nullb_device_##NAME##_show(struct config_item *item, char *page) \ { \ return nullb_device_##TYPE##_attr_show( \ to_nullb_device(item)->NAME, page); \ } \ static ssize_t \ nullb_device_##NAME##_store(struct config_item *item, const char *page, \ size_t count) \ { \ int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\ struct nullb_device *dev = to_nullb_device(item); \ TYPE new_value = 0; \ int ret; \ \ ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\ if (ret < 0) \ return ret; \ if (apply_fn) \ ret = apply_fn(dev, new_value); \ else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) \ ret = -EBUSY; \ if (ret < 0) \ return ret; \ dev->NAME = new_value; \ return count; \ } \ CONFIGFS_ATTR(nullb_device_, NAME); static int nullb_update_nr_hw_queues(struct nullb_device *dev, unsigned int submit_queues, unsigned int poll_queues) { struct blk_mq_tag_set *set; int ret, nr_hw_queues; if (!dev->nullb) return 0; /* * Make sure at least one submit queue exists. */ if (!submit_queues) return -EINVAL; /* * Make sure that null_init_hctx() does not access nullb->queues[] past * the end of that array. */ if (submit_queues > nr_cpu_ids || poll_queues > g_poll_queues) return -EINVAL; /* * Keep previous and new queue numbers in nullb_device for reference in * the call back function null_map_queues(). */ dev->prev_submit_queues = dev->submit_queues; dev->prev_poll_queues = dev->poll_queues; dev->submit_queues = submit_queues; dev->poll_queues = poll_queues; set = dev->nullb->tag_set; nr_hw_queues = submit_queues + poll_queues; blk_mq_update_nr_hw_queues(set, nr_hw_queues); ret = set->nr_hw_queues == nr_hw_queues ? 
0 : -ENOMEM; if (ret) { /* on error, revert the queue numbers */ dev->submit_queues = dev->prev_submit_queues; dev->poll_queues = dev->prev_poll_queues; } return ret; } static int nullb_apply_submit_queues(struct nullb_device *dev, unsigned int submit_queues) { int ret; mutex_lock(&lock); ret = nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues); mutex_unlock(&lock); return ret; } static int nullb_apply_poll_queues(struct nullb_device *dev, unsigned int poll_queues) { int ret; mutex_lock(&lock); ret = nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues); mutex_unlock(&lock); return ret; } NULLB_DEVICE_ATTR(size, ulong, NULL); NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL); NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); NULLB_DEVICE_ATTR(poll_queues, uint, nullb_apply_poll_queues); NULLB_DEVICE_ATTR(home_node, uint, NULL); NULLB_DEVICE_ATTR(queue_mode, uint, NULL); NULLB_DEVICE_ATTR(blocksize, uint, NULL); NULLB_DEVICE_ATTR(max_sectors, uint, NULL); NULLB_DEVICE_ATTR(irqmode, uint, NULL); NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); NULLB_DEVICE_ATTR(index, uint, NULL); NULLB_DEVICE_ATTR(blocking, bool, NULL); NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL); NULLB_DEVICE_ATTR(memory_backed, bool, NULL); NULLB_DEVICE_ATTR(discard, bool, NULL); NULLB_DEVICE_ATTR(mbps, uint, NULL); NULLB_DEVICE_ATTR(cache_size, ulong, NULL); NULLB_DEVICE_ATTR(zoned, bool, NULL); NULLB_DEVICE_ATTR(zone_size, ulong, NULL); NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL); NULLB_DEVICE_ATTR(zone_full, bool, NULL); NULLB_DEVICE_ATTR(virt_boundary, bool, NULL); NULLB_DEVICE_ATTR(no_sched, bool, NULL); NULLB_DEVICE_ATTR(shared_tags, bool, NULL); NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); NULLB_DEVICE_ATTR(fua, bool, NULL); NULLB_DEVICE_ATTR(rotational, bool, NULL); NULLB_DEVICE_ATTR(badblocks_once, bool, NULL); NULLB_DEVICE_ATTR(badblocks_partial_io, bool, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { return nullb_device_bool_attr_show(to_nullb_device(item)->power, page); } static ssize_t nullb_device_power_store(struct config_item *item, const char *page, size_t count) { struct nullb_device *dev = to_nullb_device(item); bool newp = false; ssize_t ret; ret = nullb_device_bool_attr_store(&newp, page, count); if (ret < 0) return ret; ret = count; mutex_lock(&lock); if (!dev->power && newp) { if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) goto out; ret = null_add_dev(dev); if (ret) { clear_bit(NULLB_DEV_FL_UP, &dev->flags); goto out; } set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); dev->power = newp; ret = count; } else if (dev->power && !newp) { if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { dev->power = newp; null_del_dev(dev->nullb); } clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); } out: mutex_unlock(&lock); return ret; } CONFIGFS_ATTR(nullb_device_, power); static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page) { struct nullb_device *t_dev = to_nullb_device(item); return badblocks_show(&t_dev->badblocks, page, 0); } static ssize_t nullb_device_badblocks_store(struct config_item *item, const char *page, size_t count) { struct nullb_device *t_dev = to_nullb_device(item); char *orig, *buf, *tmp; u64 start, end; int ret; orig = kstrndup(page, count, GFP_KERNEL); if (!orig) return -ENOMEM; buf = 
strstrip(orig); ret = -EINVAL; if (buf[0] != '+' && buf[0] != '-') goto out; tmp = strchr(&buf[1], '-'); if (!tmp) goto out; *tmp = '\0'; ret = kstrtoull(buf + 1, 0, &start); if (ret) goto out; ret = kstrtoull(tmp + 1, 0, &end); if (ret) goto out; ret = -EINVAL; if (start > end) goto out; /* enable badblocks */ cmpxchg(&t_dev->badblocks.shift, -1, 0); if (buf[0] == '+') { if (badblocks_set(&t_dev->badblocks, start, end - start + 1, 1)) ret = count; } else if (badblocks_clear(&t_dev->badblocks, start, end - start + 1)) { ret = count; } out: kfree(orig); return ret; } CONFIGFS_ATTR(nullb_device_, badblocks); static ssize_t nullb_device_zone_readonly_store(struct config_item *item, const char *page, size_t count) { struct nullb_device *dev = to_nullb_device(item); return zone_cond_store(dev, page, count, BLK_ZONE_COND_READONLY); } CONFIGFS_ATTR_WO(nullb_device_, zone_readonly); static ssize_t nullb_device_zone_offline_store(struct config_item *item, const char *page, size_t count) { struct nullb_device *dev = to_nullb_device(item); return zone_cond_store(dev, page, count, BLK_ZONE_COND_OFFLINE); } CONFIGFS_ATTR_WO(nullb_device_, zone_offline); static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_badblocks, &nullb_device_attr_badblocks_once, &nullb_device_attr_badblocks_partial_io, &nullb_device_attr_blocking, &nullb_device_attr_blocksize, &nullb_device_attr_cache_size, &nullb_device_attr_completion_nsec, &nullb_device_attr_discard, &nullb_device_attr_fua, &nullb_device_attr_home_node, &nullb_device_attr_hw_queue_depth, &nullb_device_attr_index, &nullb_device_attr_irqmode, &nullb_device_attr_max_sectors, &nullb_device_attr_mbps, &nullb_device_attr_memory_backed, &nullb_device_attr_no_sched, &nullb_device_attr_poll_queues, &nullb_device_attr_power, &nullb_device_attr_queue_mode, &nullb_device_attr_rotational, &nullb_device_attr_shared_tag_bitmap, &nullb_device_attr_shared_tags, &nullb_device_attr_size, &nullb_device_attr_submit_queues, &nullb_device_attr_use_per_node_hctx, &nullb_device_attr_virt_boundary, &nullb_device_attr_zone_append_max_sectors, &nullb_device_attr_zone_capacity, &nullb_device_attr_zone_full, &nullb_device_attr_zone_max_active, &nullb_device_attr_zone_max_open, &nullb_device_attr_zone_nr_conv, &nullb_device_attr_zone_offline, &nullb_device_attr_zone_readonly, &nullb_device_attr_zone_size, &nullb_device_attr_zoned, NULL, }; static void nullb_device_release(struct config_item *item) { struct nullb_device *dev = to_nullb_device(item); null_free_device_storage(dev, false); null_free_dev(dev); } static struct configfs_item_operations nullb_device_ops = { .release = nullb_device_release, }; static const struct config_item_type nullb_device_type = { .ct_item_ops = &nullb_device_ops, .ct_attrs = nullb_device_attrs, .ct_owner = THIS_MODULE, }; #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION static void nullb_add_fault_config(struct nullb_device *dev) { fault_config_init(&dev->timeout_config, "timeout_inject"); fault_config_init(&dev->requeue_config, "requeue_inject"); fault_config_init(&dev->init_hctx_fault_config, "init_hctx_fault_inject"); configfs_add_default_group(&dev->timeout_config.group, &dev->group); configfs_add_default_group(&dev->requeue_config.group, &dev->group); configfs_add_default_group(&dev->init_hctx_fault_config.group, &dev->group); } #else static void nullb_add_fault_config(struct nullb_device *dev) { } #endif static struct config_group *nullb_group_make_group(struct config_group *group, const char *name) { struct nullb_device *dev; if 
(null_find_dev_by_name(name)) return ERR_PTR(-EEXIST); dev = null_alloc_dev(); if (!dev) return ERR_PTR(-ENOMEM); config_group_init_type_name(&dev->group, name, &nullb_device_type); nullb_add_fault_config(dev); return &dev->group; } static void nullb_group_drop_item(struct config_group *group, struct config_item *item) { struct nullb_device *dev = to_nullb_device(item); if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { mutex_lock(&lock); dev->power = false; null_del_dev(dev->nullb); mutex_unlock(&lock); } config_item_put(item); } static ssize_t memb_group_features_show(struct config_item *item, char *page) { struct configfs_attribute **entry; char delimiter = ','; size_t left = PAGE_SIZE; size_t written = 0; int ret; for (entry = &nullb_device_attrs[0]; *entry && left > 0; entry++) { if (!*(entry + 1)) delimiter = '\n'; ret = snprintf(page + written, left, "%s%c", (*entry)->ca_name, delimiter); if (ret >= left) { WARN_ONCE(1, "Too many null_blk features to print\n"); memzero_explicit(page, PAGE_SIZE); return -ENOBUFS; } left -= ret; written += ret; } return written; } CONFIGFS_ATTR_RO(memb_group_, features); static struct configfs_attribute *nullb_group_attrs[] = { &memb_group_attr_features, NULL, }; static struct configfs_group_operations nullb_group_ops = { .make_group = nullb_group_make_group, .drop_item = nullb_group_drop_item, }; static const struct config_item_type nullb_group_type = { .ct_group_ops = &nullb_group_ops, .ct_attrs = nullb_group_attrs, .ct_owner = THIS_MODULE, }; static struct configfs_subsystem nullb_subsys = { .su_group = { .cg_item = { .ci_namebuf = "nullb", .ci_type = &nullb_group_type, }, }, }; static inline int null_cache_active(struct nullb *nullb) { return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); } static struct nullb_device *null_alloc_dev(void) { struct nullb_device *dev; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return NULL; #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION dev->timeout_config.attr = null_timeout_attr; dev->requeue_config.attr = null_requeue_attr; dev->init_hctx_fault_config.attr = null_init_hctx_attr; #endif INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC); if (badblocks_init(&dev->badblocks, 0)) { kfree(dev); return NULL; } dev->size = g_gb * 1024; dev->completion_nsec = g_completion_nsec; dev->submit_queues = g_submit_queues; dev->prev_submit_queues = g_submit_queues; dev->poll_queues = g_poll_queues; dev->prev_poll_queues = g_poll_queues; dev->home_node = g_home_node; dev->queue_mode = g_queue_mode; dev->blocksize = g_bs; dev->max_sectors = g_max_sectors; dev->irqmode = g_irqmode; dev->hw_queue_depth = g_hw_queue_depth; dev->blocking = g_blocking; dev->memory_backed = g_memory_backed; dev->discard = g_discard; dev->cache_size = g_cache_size; dev->mbps = g_mbps; dev->use_per_node_hctx = g_use_per_node_hctx; dev->zoned = g_zoned; dev->zone_size = g_zone_size; dev->zone_capacity = g_zone_capacity; dev->zone_nr_conv = g_zone_nr_conv; dev->zone_max_open = g_zone_max_open; dev->zone_max_active = g_zone_max_active; dev->zone_append_max_sectors = g_zone_append_max_sectors; dev->zone_full = g_zone_full; dev->virt_boundary = g_virt_boundary; dev->no_sched = g_no_sched; dev->shared_tags = g_shared_tags; dev->shared_tag_bitmap = g_shared_tag_bitmap; dev->fua = g_fua; dev->rotational = g_rotational; return dev; } static void null_free_dev(struct nullb_device *dev) { if (!dev) return; null_free_zoned_dev(dev); badblocks_exit(&dev->badblocks); kfree(dev); } static enum hrtimer_restart 
null_cmd_timer_expired(struct hrtimer *timer) { struct nullb_cmd *cmd = container_of(timer, struct nullb_cmd, timer); blk_mq_end_request(blk_mq_rq_from_pdu(cmd), cmd->error); return HRTIMER_NORESTART; } static void null_cmd_end_timer(struct nullb_cmd *cmd) { ktime_t kt = cmd->nq->dev->completion_nsec; hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); } static void null_complete_rq(struct request *rq) { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); blk_mq_end_request(rq, cmd->error); } static struct nullb_page *null_alloc_page(void) { struct nullb_page *t_page; t_page = kmalloc(sizeof(struct nullb_page), GFP_NOIO); if (!t_page) return NULL; t_page->page = alloc_pages(GFP_NOIO, 0); if (!t_page->page) { kfree(t_page); return NULL; } memset(t_page->bitmap, 0, sizeof(t_page->bitmap)); return t_page; } static void null_free_page(struct nullb_page *t_page) { __set_bit(NULLB_PAGE_FREE, t_page->bitmap); if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap)) return; __free_page(t_page->page); kfree(t_page); } static bool null_page_empty(struct nullb_page *page) { int size = MAP_SZ - 2; return find_first_bit(page->bitmap, size) == size; } static void null_free_sector(struct nullb *nullb, sector_t sector, bool is_cache) { unsigned int sector_bit; u64 idx; struct nullb_page *t_page, *ret; struct radix_tree_root *root; root = is_cache ? &nullb->dev->cache : &nullb->dev->data; idx = sector >> PAGE_SECTORS_SHIFT; sector_bit = (sector & SECTOR_MASK); t_page = radix_tree_lookup(root, idx); if (t_page) { __clear_bit(sector_bit, t_page->bitmap); if (null_page_empty(t_page)) { ret = radix_tree_delete_item(root, idx, t_page); WARN_ON(ret != t_page); null_free_page(ret); if (is_cache) nullb->dev->curr_cache -= PAGE_SIZE; } } } static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx, struct nullb_page *t_page, bool is_cache) { struct radix_tree_root *root; root = is_cache ? &nullb->dev->cache : &nullb->dev->data; if (radix_tree_insert(root, idx, t_page)) { null_free_page(t_page); t_page = radix_tree_lookup(root, idx); WARN_ON(!t_page || t_page->page->private != idx); } else if (is_cache) nullb->dev->curr_cache += PAGE_SIZE; return t_page; } static void null_free_device_storage(struct nullb_device *dev, bool is_cache) { unsigned long pos = 0; int nr_pages; struct nullb_page *ret, *t_pages[FREE_BATCH]; struct radix_tree_root *root; root = is_cache ? &dev->cache : &dev->data; do { int i; nr_pages = radix_tree_gang_lookup(root, (void **)t_pages, pos, FREE_BATCH); for (i = 0; i < nr_pages; i++) { pos = t_pages[i]->page->private; ret = radix_tree_delete_item(root, pos, t_pages[i]); WARN_ON(ret != t_pages[i]); null_free_page(ret); } pos++; } while (nr_pages == FREE_BATCH); if (is_cache) dev->curr_cache = 0; } static struct nullb_page *__null_lookup_page(struct nullb *nullb, sector_t sector, bool for_write, bool is_cache) { unsigned int sector_bit; u64 idx; struct nullb_page *t_page; struct radix_tree_root *root; idx = sector >> PAGE_SECTORS_SHIFT; sector_bit = (sector & SECTOR_MASK); root = is_cache ? 
&nullb->dev->cache : &nullb->dev->data; t_page = radix_tree_lookup(root, idx); WARN_ON(t_page && t_page->page->private != idx); if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap))) return t_page; return NULL; } static struct nullb_page *null_lookup_page(struct nullb *nullb, sector_t sector, bool for_write, bool ignore_cache) { struct nullb_page *page = NULL; if (!ignore_cache) page = __null_lookup_page(nullb, sector, for_write, true); if (page) return page; return __null_lookup_page(nullb, sector, for_write, false); } static struct nullb_page *null_insert_page(struct nullb *nullb, sector_t sector, bool ignore_cache) __releases(&nullb->lock) __acquires(&nullb->lock) { u64 idx; struct nullb_page *t_page; t_page = null_lookup_page(nullb, sector, true, ignore_cache); if (t_page) return t_page; spin_unlock_irq(&nullb->lock); t_page = null_alloc_page(); if (!t_page) goto out_lock; if (radix_tree_preload(GFP_NOIO)) goto out_freepage; spin_lock_irq(&nullb->lock); idx = sector >> PAGE_SECTORS_SHIFT; t_page->page->private = idx; t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache); radix_tree_preload_end(); return t_page; out_freepage: null_free_page(t_page); out_lock: spin_lock_irq(&nullb->lock); return null_lookup_page(nullb, sector, true, ignore_cache); } static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) { int i; unsigned int offset; u64 idx; struct nullb_page *t_page, *ret; void *dst, *src; idx = c_page->page->private; t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true); __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap); if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) { null_free_page(c_page); if (t_page && null_page_empty(t_page)) { ret = radix_tree_delete_item(&nullb->dev->data, idx, t_page); null_free_page(t_page); } return 0; } if (!t_page) return -ENOMEM; src = kmap_local_page(c_page->page); dst = kmap_local_page(t_page->page); for (i = 0; i < PAGE_SECTORS; i += (nullb->dev->blocksize >> SECTOR_SHIFT)) { if (test_bit(i, c_page->bitmap)) { offset = (i << SECTOR_SHIFT); memcpy(dst + offset, src + offset, nullb->dev->blocksize); __set_bit(i, t_page->bitmap); } } kunmap_local(dst); kunmap_local(src); ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page); null_free_page(ret); nullb->dev->curr_cache -= PAGE_SIZE; return 0; } static int null_make_cache_space(struct nullb *nullb, unsigned long n) { int i, err, nr_pages; struct nullb_page *c_pages[FREE_BATCH]; unsigned long flushed = 0, one_round; again: if ((nullb->dev->cache_size * 1024 * 1024) > nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0) return 0; nr_pages = radix_tree_gang_lookup(&nullb->dev->cache, (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH); /* * nullb_flush_cache_page could unlock before using the c_pages. 
To * avoid race, we don't allow page free */ for (i = 0; i < nr_pages; i++) { nullb->cache_flush_pos = c_pages[i]->page->private; /* * We found the page which is being flushed to disk by other * threads */ if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap)) c_pages[i] = NULL; else __set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap); } one_round = 0; for (i = 0; i < nr_pages; i++) { if (c_pages[i] == NULL) continue; err = null_flush_cache_page(nullb, c_pages[i]); if (err) return err; one_round++; } flushed += one_round << PAGE_SHIFT; if (n > flushed) { if (nr_pages == 0) nullb->cache_flush_pos = 0; if (one_round == 0) { /* give other threads a chance */ spin_unlock_irq(&nullb->lock); spin_lock_irq(&nullb->lock); } goto again; } return 0; } static int copy_to_nullb(struct nullb *nullb, struct page *source, unsigned int off, sector_t sector, size_t n, bool is_fua) { size_t temp, count = 0; unsigned int offset; struct nullb_page *t_page; while (count < n) { temp = min_t(size_t, nullb->dev->blocksize, n - count); if (null_cache_active(nullb) && !is_fua) null_make_cache_space(nullb, PAGE_SIZE); offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; t_page = null_insert_page(nullb, sector, !null_cache_active(nullb) || is_fua); if (!t_page) return -ENOSPC; memcpy_page(t_page->page, offset, source, off + count, temp); __set_bit(sector & SECTOR_MASK, t_page->bitmap); if (is_fua) null_free_sector(nullb, sector, true); count += temp; sector += temp >> SECTOR_SHIFT; } return 0; } static int copy_from_nullb(struct nullb *nullb, struct page *dest, unsigned int off, sector_t sector, size_t n) { size_t temp, count = 0; unsigned int offset; struct nullb_page *t_page; while (count < n) { temp = min_t(size_t, nullb->dev->blocksize, n - count); offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; t_page = null_lookup_page(nullb, sector, false, !null_cache_active(nullb)); if (t_page) memcpy_page(dest, off + count, t_page->page, offset, temp); else memzero_page(dest, off + count, temp); count += temp; sector += temp >> SECTOR_SHIFT; } return 0; } static void nullb_fill_pattern(struct nullb *nullb, struct page *page, unsigned int len, unsigned int off) { memset_page(page, off, 0xff, len); } blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector, sector_t nr_sectors) { struct nullb *nullb = dev->nullb; size_t n = nr_sectors << SECTOR_SHIFT; size_t temp; spin_lock_irq(&nullb->lock); while (n > 0) { temp = min_t(size_t, n, dev->blocksize); null_free_sector(nullb, sector, false); if (null_cache_active(nullb)) null_free_sector(nullb, sector, true); sector += temp >> SECTOR_SHIFT; n -= temp; } spin_unlock_irq(&nullb->lock); return BLK_STS_OK; } static blk_status_t null_handle_flush(struct nullb *nullb) { int err; if (!null_cache_active(nullb)) return 0; spin_lock_irq(&nullb->lock); while (true) { err = null_make_cache_space(nullb, nullb->dev->cache_size * 1024 * 1024); if (err || nullb->dev->curr_cache == 0) break; } WARN_ON(!radix_tree_empty(&nullb->dev->cache)); spin_unlock_irq(&nullb->lock); return errno_to_blk_status(err); } static int null_transfer(struct nullb *nullb, struct page *page, unsigned int len, unsigned int off, bool is_write, sector_t sector, bool is_fua) { struct nullb_device *dev = nullb->dev; unsigned int valid_len = len; int err = 0; if (!is_write) { if (dev->zoned) valid_len = null_zone_valid_read_len(nullb, sector, len); if (valid_len) { err = copy_from_nullb(nullb, page, off, sector, valid_len); off += valid_len; len -= valid_len; } if (len) nullb_fill_pattern(nullb, page, len, off); 
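/* Read-path note (added comment): any tail of the segment that null_zone_valid_read_len() reported as unreadable was just filled with a 0xff pattern by nullb_fill_pattern(); the destination page's dcache is flushed next so data written by the kernel is visible to user-space mappings of the page. */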
flush_dcache_page(page); } else { flush_dcache_page(page); err = copy_to_nullb(nullb, page, off, sector, len, is_fua); } return err; } /* * Transfer data for the given request. The transfer size is capped with the * nr_sectors argument. */ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd, sector_t nr_sectors) { struct request *rq = blk_mq_rq_from_pdu(cmd); struct nullb *nullb = cmd->nq->dev->nullb; int err = 0; unsigned int len; sector_t sector = blk_rq_pos(rq); unsigned int max_bytes = nr_sectors << SECTOR_SHIFT; unsigned int transferred_bytes = 0; struct req_iterator iter; struct bio_vec bvec; spin_lock_irq(&nullb->lock); rq_for_each_segment(bvec, rq, iter) { len = bvec.bv_len; if (transferred_bytes + len > max_bytes) len = max_bytes - transferred_bytes; err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, op_is_write(req_op(rq)), sector, rq->cmd_flags & REQ_FUA); if (err) break; sector += len >> SECTOR_SHIFT; transferred_bytes += len; if (transferred_bytes >= max_bytes) break; } spin_unlock_irq(&nullb->lock); return errno_to_blk_status(err); } static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) { struct nullb_device *dev = cmd->nq->dev; struct nullb *nullb = dev->nullb; blk_status_t sts = BLK_STS_OK; struct request *rq = blk_mq_rq_from_pdu(cmd); if (!hrtimer_active(&nullb->bw_timer)) hrtimer_restart(&nullb->bw_timer); if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { blk_mq_stop_hw_queues(nullb->q); /* race with timer */ if (atomic_long_read(&nullb->cur_bytes) > 0) blk_mq_start_stopped_hw_queues(nullb->q, true); /* requeue request */ sts = BLK_STS_DEV_RESOURCE; } return sts; } /* * Check if the command should fail for the badblocks. If so, return * BLK_STS_IOERR and return number of partial I/O sectors to be written or read, * which may be less than the requested number of sectors. * * @cmd: The command to handle. * @sector: The start sector for I/O. * @nr_sectors: Specifies number of sectors to write or read, and returns the * number of sectors to be written or read. 
*/ blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, sector_t sector, unsigned int *nr_sectors) { struct badblocks *bb = &cmd->nq->dev->badblocks; struct nullb_device *dev = cmd->nq->dev; unsigned int block_sectors = dev->blocksize >> SECTOR_SHIFT; sector_t first_bad, bad_sectors; unsigned int partial_io_sectors = 0; if (!badblocks_check(bb, sector, *nr_sectors, &first_bad, &bad_sectors)) return BLK_STS_OK; if (cmd->nq->dev->badblocks_once) badblocks_clear(bb, first_bad, bad_sectors); if (cmd->nq->dev->badblocks_partial_io) { if (!IS_ALIGNED(first_bad, block_sectors)) first_bad = ALIGN_DOWN(first_bad, block_sectors); if (sector < first_bad) partial_io_sectors = first_bad - sector; } *nr_sectors = partial_io_sectors; return BLK_STS_IOERR; } blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, enum req_op op, sector_t sector, sector_t nr_sectors) { struct nullb_device *dev = cmd->nq->dev; if (op == REQ_OP_DISCARD) return null_handle_discard(dev, sector, nr_sectors); return null_handle_data_transfer(cmd, nr_sectors); } static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); struct nullb_device *dev = cmd->nq->dev; struct bio *bio; if (!dev->memory_backed && req_op(rq) == REQ_OP_READ) { __rq_for_each_bio(bio, rq) zero_fill_bio(bio); } } static inline void nullb_complete_cmd(struct nullb_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); /* * Since root privileges are required to configure the null_blk * driver, it is fine that this driver does not initialize the * data buffers of read commands. Zero-initialize these buffers * anyway if KMSAN is enabled to prevent that KMSAN complains * about null_blk not initializing read data buffers. */ if (IS_ENABLED(CONFIG_KMSAN)) nullb_zero_read_cmd_buffer(cmd); /* Complete IO by inline, softirq or timer */ switch (cmd->nq->dev->irqmode) { case NULL_IRQ_SOFTIRQ: blk_mq_complete_request(rq); break; case NULL_IRQ_NONE: blk_mq_end_request(rq, cmd->error); break; case NULL_IRQ_TIMER: null_cmd_end_timer(cmd); break; } } blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, sector_t sector, unsigned int nr_sectors) { struct nullb_device *dev = cmd->nq->dev; blk_status_t badblocks_ret = BLK_STS_OK; blk_status_t ret; if (dev->badblocks.shift != -1) badblocks_ret = null_handle_badblocks(cmd, sector, &nr_sectors); if (dev->memory_backed && nr_sectors) { ret = null_handle_memory_backed(cmd, op, sector, nr_sectors); if (ret != BLK_STS_OK) return ret; } return badblocks_ret; } static void null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, sector_t nr_sectors, enum req_op op) { struct nullb_device *dev = cmd->nq->dev; struct nullb *nullb = dev->nullb; blk_status_t sts; if (op == REQ_OP_FLUSH) { cmd->error = null_handle_flush(nullb); goto out; } if (dev->zoned) sts = null_process_zoned_cmd(cmd, op, sector, nr_sectors); else sts = null_process_cmd(cmd, op, sector, nr_sectors); /* Do not overwrite errors (e.g. 
timeout errors) */ if (cmd->error == BLK_STS_OK) cmd->error = sts; out: nullb_complete_cmd(cmd); } static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) { struct nullb *nullb = container_of(timer, struct nullb, bw_timer); ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); unsigned int mbps = nullb->dev->mbps; if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps)) return HRTIMER_NORESTART; atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); blk_mq_start_stopped_hw_queues(nullb->q, true); hrtimer_forward_now(&nullb->bw_timer, timer_interval); return HRTIMER_RESTART; } static void nullb_setup_bwtimer(struct nullb *nullb) { ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); hrtimer_setup(&nullb->bw_timer, nullb_bwtimer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); } #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION static bool should_timeout_request(struct request *rq) { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); struct nullb_device *dev = cmd->nq->dev; return should_fail(&dev->timeout_config.attr, 1); } static bool should_requeue_request(struct request *rq) { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); struct nullb_device *dev = cmd->nq->dev; return should_fail(&dev->requeue_config.attr, 1); } static bool should_init_hctx_fail(struct nullb_device *dev) { return should_fail(&dev->init_hctx_fault_config.attr, 1); } #else static bool should_timeout_request(struct request *rq) { return false; } static bool should_requeue_request(struct request *rq) { return false; } static bool should_init_hctx_fail(struct nullb_device *dev) { return false; } #endif static void null_map_queues(struct blk_mq_tag_set *set) { struct nullb *nullb = set->driver_data; int i, qoff; unsigned int submit_queues = g_submit_queues; unsigned int poll_queues = g_poll_queues; if (nullb) { struct nullb_device *dev = nullb->dev; /* * Refer nr_hw_queues of the tag set to check if the expected * number of hardware queues are prepared. If block layer failed * to prepare them, use previous numbers of submit queues and * poll queues to map queues. 
*/ if (set->nr_hw_queues == dev->submit_queues + dev->poll_queues) { submit_queues = dev->submit_queues; poll_queues = dev->poll_queues; } else if (set->nr_hw_queues == dev->prev_submit_queues + dev->prev_poll_queues) { submit_queues = dev->prev_submit_queues; poll_queues = dev->prev_poll_queues; } else { pr_warn("tag set has unexpected nr_hw_queues: %d\n", set->nr_hw_queues); WARN_ON_ONCE(true); submit_queues = 1; poll_queues = 0; } } for (i = 0, qoff = 0; i < set->nr_maps; i++) { struct blk_mq_queue_map *map = &set->map[i]; switch (i) { case HCTX_TYPE_DEFAULT: map->nr_queues = submit_queues; break; case HCTX_TYPE_READ: map->nr_queues = 0; continue; case HCTX_TYPE_POLL: map->nr_queues = poll_queues; break; } map->queue_offset = qoff; qoff += map->nr_queues; blk_mq_map_queues(map); } } static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct nullb_queue *nq = hctx->driver_data; LIST_HEAD(list); int nr = 0; struct request *rq; spin_lock(&nq->poll_lock); list_splice_init(&nq->poll_list, &list); list_for_each_entry(rq, &list, queuelist) blk_mq_set_request_complete(rq); spin_unlock(&nq->poll_lock); while (!list_empty(&list)) { struct nullb_cmd *cmd; struct request *req; req = list_first_entry(&list, struct request, queuelist); list_del_init(&req->queuelist); cmd = blk_mq_rq_to_pdu(req); cmd->error = null_process_cmd(cmd, req_op(req), blk_rq_pos(req), blk_rq_sectors(req)); if (!blk_mq_add_to_batch(req, iob, cmd->error != BLK_STS_OK, blk_mq_end_request_batch)) blk_mq_end_request(req, cmd->error); nr++; } return nr; } static enum blk_eh_timer_return null_timeout_rq(struct request *rq) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); if (hctx->type == HCTX_TYPE_POLL) { struct nullb_queue *nq = hctx->driver_data; spin_lock(&nq->poll_lock); /* The request may have completed meanwhile. */ if (blk_mq_request_completed(rq)) { spin_unlock(&nq->poll_lock); return BLK_EH_DONE; } list_del_init(&rq->queuelist); spin_unlock(&nq->poll_lock); } pr_info("rq %p timed out\n", rq); /* * If the device is marked as blocking (i.e. memory backed or zoned * device), the submission path may be blocked waiting for resources * and cause real timeouts. For these real timeouts, the submission * path will complete the request using blk_mq_complete_request(). * Only fake timeouts need to execute blk_mq_complete_request() here. 
*/ cmd->error = BLK_STS_TIMEOUT; if (cmd->fake_timeout || hctx->type == HCTX_TYPE_POLL) blk_mq_complete_request(rq); return BLK_EH_DONE; } static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); struct nullb_queue *nq = hctx->driver_data; sector_t nr_sectors = blk_rq_sectors(rq); sector_t sector = blk_rq_pos(rq); const bool is_poll = hctx->type == HCTX_TYPE_POLL; might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); if (!is_poll && nq->dev->irqmode == NULL_IRQ_TIMER) { hrtimer_setup(&cmd->timer, null_cmd_timer_expired, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } cmd->error = BLK_STS_OK; cmd->nq = nq; cmd->fake_timeout = should_timeout_request(rq) || blk_should_fake_timeout(rq->q); if (should_requeue_request(rq)) { /* * Alternate between hitting the core BUSY path, and the * driver driven requeue path */ nq->requeue_selection++; if (nq->requeue_selection & 1) return BLK_STS_RESOURCE; blk_mq_requeue_request(rq, true); return BLK_STS_OK; } if (test_bit(NULLB_DEV_FL_THROTTLED, &nq->dev->flags)) { blk_status_t sts = null_handle_throttled(cmd); if (sts != BLK_STS_OK) return sts; } blk_mq_start_request(rq); if (is_poll) { spin_lock(&nq->poll_lock); list_add_tail(&rq->queuelist, &nq->poll_list); spin_unlock(&nq->poll_lock); return BLK_STS_OK; } if (cmd->fake_timeout) return BLK_STS_OK; null_handle_cmd(cmd, sector, nr_sectors, req_op(rq)); return BLK_STS_OK; } static void null_queue_rqs(struct rq_list *rqlist) { struct rq_list requeue_list = {}; struct blk_mq_queue_data bd = { }; blk_status_t ret; do { struct request *rq = rq_list_pop(rqlist); bd.rq = rq; ret = null_queue_rq(rq->mq_hctx, &bd); if (ret != BLK_STS_OK) rq_list_add_tail(&requeue_list, rq); } while (!rq_list_empty(rqlist)); *rqlist = requeue_list; } static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) { nq->dev = nullb->dev; INIT_LIST_HEAD(&nq->poll_list); spin_lock_init(&nq->poll_lock); } static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, unsigned int hctx_idx) { struct nullb *nullb = hctx->queue->queuedata; struct nullb_queue *nq; if (should_init_hctx_fail(nullb->dev)) return -EFAULT; nq = &nullb->queues[hctx_idx]; hctx->driver_data = nq; null_init_queue(nullb, nq); return 0; } static const struct blk_mq_ops null_mq_ops = { .queue_rq = null_queue_rq, .queue_rqs = null_queue_rqs, .complete = null_complete_rq, .timeout = null_timeout_rq, .poll = null_poll, .map_queues = null_map_queues, .init_hctx = null_init_hctx, }; static void null_del_dev(struct nullb *nullb) { struct nullb_device *dev; if (!nullb) return; dev = nullb->dev; ida_free(&nullb_indexes, nullb->index); list_del_init(&nullb->list); del_gendisk(nullb->disk); if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { hrtimer_cancel(&nullb->bw_timer); atomic_long_set(&nullb->cur_bytes, LONG_MAX); blk_mq_start_stopped_hw_queues(nullb->q, true); } put_disk(nullb->disk); if (nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); kfree(nullb->queues); if (null_cache_active(nullb)) null_free_device_storage(nullb->dev, true); kfree(nullb); dev->nullb = NULL; } static void null_config_discard(struct nullb *nullb, struct queue_limits *lim) { if (nullb->dev->discard == false) return; if (!nullb->dev->memory_backed) { nullb->dev->discard = false; pr_info("discard option is ignored without memory backing\n"); return; } if (nullb->dev->zoned) { nullb->dev->discard = false; pr_info("discard option is ignored in zoned 
mode\n"); return; } lim->max_hw_discard_sectors = UINT_MAX >> 9; } static const struct block_device_operations null_ops = { .owner = THIS_MODULE, .report_zones = null_report_zones, }; static int setup_queues(struct nullb *nullb) { int nqueues = nr_cpu_ids; if (g_poll_queues) nqueues += g_poll_queues; nullb->queues = kcalloc(nqueues, sizeof(struct nullb_queue), GFP_KERNEL); if (!nullb->queues) return -ENOMEM; return 0; } static int null_init_tag_set(struct blk_mq_tag_set *set, int poll_queues) { set->ops = &null_mq_ops; set->cmd_size = sizeof(struct nullb_cmd); set->timeout = 5 * HZ; set->nr_maps = 1; if (poll_queues) { set->nr_hw_queues += poll_queues; set->nr_maps += 2; } return blk_mq_alloc_tag_set(set); } static int null_init_global_tag_set(void) { int error; if (tag_set.ops) return 0; tag_set.nr_hw_queues = g_submit_queues; tag_set.queue_depth = g_hw_queue_depth; tag_set.numa_node = g_home_node; if (g_no_sched) tag_set.flags |= BLK_MQ_F_NO_SCHED_BY_DEFAULT; if (g_shared_tag_bitmap) tag_set.flags |= BLK_MQ_F_TAG_HCTX_SHARED; if (g_blocking) tag_set.flags |= BLK_MQ_F_BLOCKING; error = null_init_tag_set(&tag_set, g_poll_queues); if (error) tag_set.ops = NULL; return error; } static int null_setup_tagset(struct nullb *nullb) { if (nullb->dev->shared_tags) { nullb->tag_set = &tag_set; return null_init_global_tag_set(); } nullb->tag_set = &nullb->__tag_set; nullb->tag_set->driver_data = nullb; nullb->tag_set->nr_hw_queues = nullb->dev->submit_queues; nullb->tag_set->queue_depth = nullb->dev->hw_queue_depth; nullb->tag_set->numa_node = nullb->dev->home_node; if (nullb->dev->no_sched) nullb->tag_set->flags |= BLK_MQ_F_NO_SCHED_BY_DEFAULT; if (nullb->dev->shared_tag_bitmap) nullb->tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; if (nullb->dev->blocking) nullb->tag_set->flags |= BLK_MQ_F_BLOCKING; return null_init_tag_set(nullb->tag_set, nullb->dev->poll_queues); } static int null_validate_conf(struct nullb_device *dev) { if (dev->queue_mode == NULL_Q_RQ) { pr_err("legacy IO path is no longer available\n"); return -EINVAL; } if (dev->queue_mode == NULL_Q_BIO) { pr_err("BIO-based IO path is no longer available, using blk-mq instead.\n"); dev->queue_mode = NULL_Q_MQ; } if (dev->use_per_node_hctx) { if (dev->submit_queues != nr_online_nodes) dev->submit_queues = nr_online_nodes; } else if (dev->submit_queues > nr_cpu_ids) dev->submit_queues = nr_cpu_ids; else if (dev->submit_queues == 0) dev->submit_queues = 1; dev->prev_submit_queues = dev->submit_queues; if (dev->poll_queues > g_poll_queues) dev->poll_queues = g_poll_queues; dev->prev_poll_queues = dev->poll_queues; dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); /* Do memory allocation, so set blocking */ if (dev->memory_backed) dev->blocking = true; else /* cache is meaningless */ dev->cache_size = 0; dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, dev->cache_size); dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); if (dev->zoned && (!dev->zone_size || !is_power_of_2(dev->zone_size))) { pr_err("zone_size must be power-of-two\n"); return -EINVAL; } return 0; } #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION static bool __null_setup_fault(struct fault_attr *attr, char *str) { if (!str[0]) return true; if (!setup_fault_attr(attr, str)) return false; attr->verbose = 0; return true; } #endif static bool null_setup_fault(void) { #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION if (!__null_setup_fault(&null_timeout_attr, g_timeout_str)) return false; if (!__null_setup_fault(&null_requeue_attr, g_requeue_str)) return 
false; if (!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str)) return false; #endif return true; } static int null_add_dev(struct nullb_device *dev) { struct queue_limits lim = { .logical_block_size = dev->blocksize, .physical_block_size = dev->blocksize, .max_hw_sectors = dev->max_sectors, .dma_alignment = dev->blocksize - 1, }; struct nullb *nullb; int rv; rv = null_validate_conf(dev); if (rv) return rv; nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); if (!nullb) { rv = -ENOMEM; goto out; } nullb->dev = dev; dev->nullb = nullb; spin_lock_init(&nullb->lock); rv = setup_queues(nullb); if (rv) goto out_free_nullb; rv = null_setup_tagset(nullb); if (rv) goto out_cleanup_queues; if (dev->virt_boundary) lim.virt_boundary_mask = PAGE_SIZE - 1; null_config_discard(nullb, &lim); if (dev->zoned) { rv = null_init_zoned_dev(dev, &lim); if (rv) goto out_cleanup_tags; } if (dev->cache_size > 0) { set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); lim.features |= BLK_FEAT_WRITE_CACHE; if (dev->fua) lim.features |= BLK_FEAT_FUA; } if (dev->rotational) lim.features |= BLK_FEAT_ROTATIONAL; nullb->disk = blk_mq_alloc_disk(nullb->tag_set, &lim, nullb); if (IS_ERR(nullb->disk)) { rv = PTR_ERR(nullb->disk); goto out_cleanup_zone; } nullb->q = nullb->disk->queue; if (dev->mbps) { set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags); nullb_setup_bwtimer(nullb); } nullb->q->queuedata = nullb; rv = ida_alloc(&nullb_indexes, GFP_KERNEL); if (rv < 0) goto out_cleanup_disk; nullb->index = rv; dev->index = rv; if (config_item_name(&dev->group.cg_item)) { /* Use configfs dir name as the device name */ snprintf(nullb->disk_name, sizeof(nullb->disk_name), "%s", config_item_name(&dev->group.cg_item)); } else { sprintf(nullb->disk_name, "nullb%d", nullb->index); } set_capacity(nullb->disk, ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT); nullb->disk->major = null_major; nullb->disk->first_minor = nullb->index; nullb->disk->minors = 1; nullb->disk->fops = &null_ops; nullb->disk->private_data = nullb; strscpy(nullb->disk->disk_name, nullb->disk_name); if (nullb->dev->zoned) { rv = null_register_zoned_dev(nullb); if (rv) goto out_ida_free; } rv = add_disk(nullb->disk); if (rv) goto out_ida_free; list_add_tail(&nullb->list, &nullb_list); pr_info("disk %s created\n", nullb->disk_name); return 0; out_ida_free: ida_free(&nullb_indexes, nullb->index); out_cleanup_disk: put_disk(nullb->disk); out_cleanup_zone: null_free_zoned_dev(dev); out_cleanup_tags: if (nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); out_cleanup_queues: kfree(nullb->queues); out_free_nullb: kfree(nullb); dev->nullb = NULL; out: return rv; } static struct nullb *null_find_dev_by_name(const char *name) { struct nullb *nullb = NULL, *nb; mutex_lock(&lock); list_for_each_entry(nb, &nullb_list, list) { if (strcmp(nb->disk_name, name) == 0) { nullb = nb; break; } } mutex_unlock(&lock); return nullb; } static int null_create_dev(void) { struct nullb_device *dev; int ret; dev = null_alloc_dev(); if (!dev) return -ENOMEM; mutex_lock(&lock); ret = null_add_dev(dev); mutex_unlock(&lock); if (ret) { null_free_dev(dev); return ret; } return 0; } static void null_destroy_dev(struct nullb *nullb) { struct nullb_device *dev = nullb->dev; null_del_dev(nullb); null_free_device_storage(dev, false); null_free_dev(dev); } static int __init null_init(void) { int ret = 0; unsigned int i; struct nullb *nullb; if (g_bs > PAGE_SIZE) { pr_warn("invalid block size\n"); pr_warn("defaults block size to %lu\n", PAGE_SIZE); g_bs = PAGE_SIZE; } 
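/* Added comment: g_home_node feeds kzalloc_node() and the tag set's numa_node later on, so values outside the range of online nodes are rejected below and replaced with NUMA_NO_NODE. */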
if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) { pr_err("invalid home_node value\n"); g_home_node = NUMA_NO_NODE; } if (!null_setup_fault()) return -EINVAL; if (g_queue_mode == NULL_Q_RQ) { pr_err("legacy IO path is no longer available\n"); return -EINVAL; } if (g_use_per_node_hctx) { if (g_submit_queues != nr_online_nodes) { pr_warn("submit_queues param is set to %u.\n", nr_online_nodes); g_submit_queues = nr_online_nodes; } } else if (g_submit_queues > nr_cpu_ids) { g_submit_queues = nr_cpu_ids; } else if (g_submit_queues <= 0) { g_submit_queues = 1; } config_group_init(&nullb_subsys.su_group); mutex_init(&nullb_subsys.su_mutex); ret = configfs_register_subsystem(&nullb_subsys); if (ret) return ret; mutex_init(&lock); null_major = register_blkdev(0, "nullb"); if (null_major < 0) { ret = null_major; goto err_conf; } for (i = 0; i < nr_devices; i++) { ret = null_create_dev(); if (ret) goto err_dev; } pr_info("module loaded\n"); return 0; err_dev: while (!list_empty(&nullb_list)) { nullb = list_entry(nullb_list.next, struct nullb, list); null_destroy_dev(nullb); } unregister_blkdev(null_major, "nullb"); err_conf: configfs_unregister_subsystem(&nullb_subsys); return ret; } static void __exit null_exit(void) { struct nullb *nullb; configfs_unregister_subsystem(&nullb_subsys); unregister_blkdev(null_major, "nullb"); mutex_lock(&lock); while (!list_empty(&nullb_list)) { nullb = list_entry(nullb_list.next, struct nullb, list); null_destroy_dev(nullb); } mutex_unlock(&lock); if (tag_set.ops) blk_mq_free_tag_set(&tag_set); mutex_destroy(&lock); } module_init(null_init); module_exit(null_exit); MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>"); MODULE_DESCRIPTION("multi queue aware block test driver"); MODULE_LICENSE("GPL"); |
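To make the configfs attribute plumbing above easier to follow, here is a hand expansion of the NULLB_DEVICE_ATTR() macro for one representative attribute, mbps, which the driver declares as NULLB_DEVICE_ATTR(mbps, uint, NULL). This is only an illustrative sketch of what the preprocessor generates from the macro defined earlier in this file, not additional driver code; uint is the kernel's typedef for unsigned int, and layout/whitespace are approximations.

static ssize_t nullb_device_mbps_show(struct config_item *item, char *page)
{
        /* Render the current value through the generic uint helper. */
        return nullb_device_uint_attr_show(to_nullb_device(item)->mbps, page);
}

static ssize_t nullb_device_mbps_store(struct config_item *item,
                                       const char *page, size_t count)
{
        /* APPLY was NULL for this attribute, so there is no apply callback. */
        int (*apply_fn)(struct nullb_device *dev, uint new_value) = NULL;
        struct nullb_device *dev = to_nullb_device(item);
        uint new_value = 0;
        int ret;

        ret = nullb_device_uint_attr_store(&new_value, page, count);
        if (ret < 0)
                return ret;
        /* Without an apply callback, writes are refused once the device is configured. */
        if (apply_fn)
                ret = apply_fn(dev, new_value);
        else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags))
                ret = -EBUSY;
        if (ret < 0)
                return ret;
        dev->mbps = new_value;
        return count;
}
CONFIGFS_ATTR(nullb_device_, mbps);

Attributes that must stay writable after the device has been configured, such as submit_queues and poll_queues, pass an apply callback (nullb_apply_submit_queues() or nullb_apply_poll_queues()) instead of NULL, so a new value is routed through nullb_update_nr_hw_queues() under the global lock rather than being rejected with -EBUSY.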
| 2 1 2 2 1 2 32 19 1 1 |
1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 | // SPDX-License-Identifier: GPL-2.0 /* * Zoned block device handling * * Copyright (c) 2015, Hannes Reinecke * Copyright (c) 2015, SUSE Linux GmbH * * Copyright (c) 2016, Damien Le Moal * Copyright (c) 2016, Western Digital * Copyright (c) 2024, Western Digital Corporation or its affiliates. */ #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/spinlock.h> #include <linux/refcount.h> #include <linux/mempool.h> #include <trace/events/block.h> #include "blk.h" #include "blk-mq-sched.h" #include "blk-mq-debugfs.h" #define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name static const char *const zone_cond_name[] = { ZONE_COND_NAME(NOT_WP), ZONE_COND_NAME(EMPTY), ZONE_COND_NAME(IMP_OPEN), ZONE_COND_NAME(EXP_OPEN), ZONE_COND_NAME(CLOSED), ZONE_COND_NAME(READONLY), ZONE_COND_NAME(FULL), ZONE_COND_NAME(OFFLINE), }; #undef ZONE_COND_NAME /* * Per-zone write plug. * @node: hlist_node structure for managing the plug using a hash table. * @ref: Zone write plug reference counter. A zone write plug reference is * always at least 1 when the plug is hashed in the disk plug hash table. * The reference is incremented whenever a new BIO needing plugging is * submitted and when a function needs to manipulate a plug. The * reference count is decremented whenever a plugged BIO completes and * when a function that referenced the plug returns. The initial * reference is dropped whenever the zone of the zone write plug is reset, * finished and when the zone becomes full (last write BIO to the zone * completes). * @lock: Spinlock to atomically manipulate the plug. * @flags: Flags indicating the plug state. * @zone_no: The number of the zone the plug is managing. * @wp_offset: The zone write pointer location relative to the start of the zone * as a number of 512B sectors. * @bio_list: The list of BIOs that are currently plugged. * @bio_work: Work struct to handle issuing of plugged BIOs * @rcu_head: RCU head to free zone write plugs with an RCU grace period. * @disk: The gendisk the plug belongs to. 
*/ struct blk_zone_wplug { struct hlist_node node; refcount_t ref; spinlock_t lock; unsigned int flags; unsigned int zone_no; unsigned int wp_offset; struct bio_list bio_list; struct work_struct bio_work; struct rcu_head rcu_head; struct gendisk *disk; }; /* * Zone write plug flags bits: * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged, * that is, that write BIOs are being throttled due to a write BIO already * being executed or the zone write plug bio list is not empty. * - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone * write pointer offset and need to update it. * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed * from the disk hash table and that the initial reference to the zone * write plug set when the plug was first added to the hash table has been * dropped. This flag is set when a zone is reset, finished or become full, * to prevent new references to the zone write plug to be taken for * newly incoming BIOs. A zone write plug flagged with this flag will be * freed once all remaining references from BIOs or functions are dropped. */ #define BLK_ZONE_WPLUG_PLUGGED (1U << 0) #define BLK_ZONE_WPLUG_NEED_WP_UPDATE (1U << 1) #define BLK_ZONE_WPLUG_UNHASHED (1U << 2) /** * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. * @zone_cond: BLK_ZONE_COND_XXX. * * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX * into string format. Useful in the debugging and tracing zone conditions. For * invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN". */ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) { static const char *zone_cond_str = "UNKNOWN"; if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond]) zone_cond_str = zone_cond_name[zone_cond]; return zone_cond_str; } EXPORT_SYMBOL_GPL(blk_zone_cond_str); struct disk_report_zones_cb_args { struct gendisk *disk; report_zones_cb user_cb; void *user_data; }; static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, struct blk_zone *zone); static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct disk_report_zones_cb_args *args = data; struct gendisk *disk = args->disk; if (disk->zone_wplugs_hash) disk_zone_wplug_sync_wp_offset(disk, zone); if (!args->user_cb) return 0; return args->user_cb(zone, idx, args->user_data); } /** * blkdev_report_zones - Get zones information * @bdev: Target block device * @sector: Sector from which to report zones * @nr_zones: Maximum number of zones to report * @cb: Callback function called for each reported zone * @data: Private data for the callback * * Description: * Get zone information starting from the zone containing @sector for at most * @nr_zones, and call @cb for each zone reported by the device. * To report all zones in a device starting from @sector, the BLK_ALL_ZONES * constant can be passed to @nr_zones. * Returns the number of zones reported by the device, or a negative errno * value in case of failure. * * Note: The caller must use memalloc_noXX_save/restore() calls to control * memory allocations done within this function. 
*/ int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data) { struct gendisk *disk = bdev->bd_disk; sector_t capacity = get_capacity(disk); struct disk_report_zones_cb_args args = { .disk = disk, .user_cb = cb, .user_data = data, }; if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) return -EOPNOTSUPP; if (!nr_zones || sector >= capacity) return 0; return disk->fops->report_zones(disk, sector, nr_zones, disk_report_zones_cb, &args); } EXPORT_SYMBOL_GPL(blkdev_report_zones); static int blkdev_zone_reset_all(struct block_device *bdev) { struct bio bio; bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC); trace_blkdev_zone_mgmt(&bio, 0); return submit_bio_wait(&bio); } /** * blkdev_zone_mgmt - Execute a zone management operation on a range of zones * @bdev: Target block device * @op: Operation to be performed on the zones * @sector: Start sector of the first zone to operate on * @nr_sectors: Number of sectors, should be at least the length of one zone and * must be zone size aligned. * * Description: * Perform the specified operation on the range of zones specified by * @sector..@sector+@nr_sectors. Specifying the entire disk sector range * is valid, but the specified range should not contain conventional zones. * The operation to execute on each zone can be a zone reset, open, close * or finish request. */ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, sector_t sector, sector_t nr_sectors) { sector_t zone_sectors = bdev_zone_sectors(bdev); sector_t capacity = bdev_nr_sectors(bdev); sector_t end_sector = sector + nr_sectors; struct bio *bio = NULL; int ret = 0; if (!bdev_is_zoned(bdev)) return -EOPNOTSUPP; if (bdev_read_only(bdev)) return -EPERM; if (!op_is_zone_mgmt(op)) return -EOPNOTSUPP; if (end_sector <= sector || end_sector > capacity) /* Out of range */ return -EINVAL; /* Check alignment (handle eventual smaller last zone) */ if (!bdev_is_zone_start(bdev, sector)) return -EINVAL; if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity) return -EINVAL; /* * In the case of a zone reset operation over all zones, use * REQ_OP_ZONE_RESET_ALL. */ if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) return blkdev_zone_reset_all(bdev); while (sector < end_sector) { bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL); bio->bi_iter.bi_sector = sector; sector += zone_sectors; /* This may take a while, so be nice to others */ cond_resched(); } trace_blkdev_zone_mgmt(bio, nr_sectors); ret = submit_bio_wait(bio); bio_put(bio); return ret; } EXPORT_SYMBOL_GPL(blkdev_zone_mgmt); struct zone_report_args { struct blk_zone __user *zones; }; static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx, void *data) { struct zone_report_args *args = data; if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone))) return -EFAULT; return 0; } /* * BLKREPORTZONE ioctl processing. * Called from blkdev_ioctl. 
*/ int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct zone_report_args args; struct blk_zone_report rep; int ret; if (!argp) return -EINVAL; if (!bdev_is_zoned(bdev)) return -ENOTTY; if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) return -EFAULT; if (!rep.nr_zones) return -EINVAL; args.zones = argp + sizeof(struct blk_zone_report); ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones, blkdev_copy_zone_to_user, &args); if (ret < 0) return ret; rep.nr_zones = ret; rep.flags = BLK_ZONE_REP_CAPACITY; if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) return -EFAULT; return 0; } static int blkdev_truncate_zone_range(struct block_device *bdev, blk_mode_t mode, const struct blk_zone_range *zrange) { loff_t start, end; if (zrange->sector + zrange->nr_sectors <= zrange->sector || zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk)) /* Out of range */ return -EINVAL; start = zrange->sector << SECTOR_SHIFT; end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1; return truncate_bdev_range(bdev, mode, start, end); } /* * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing. * Called from blkdev_ioctl. */ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct blk_zone_range zrange; enum req_op op; int ret; if (!argp) return -EINVAL; if (!bdev_is_zoned(bdev)) return -ENOTTY; if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) return -EFAULT; switch (cmd) { case BLKRESETZONE: op = REQ_OP_ZONE_RESET; /* Invalidate the page cache, including dirty pages. */ inode_lock(bdev->bd_mapping->host); filemap_invalidate_lock(bdev->bd_mapping); ret = blkdev_truncate_zone_range(bdev, mode, &zrange); if (ret) goto fail; break; case BLKOPENZONE: op = REQ_OP_ZONE_OPEN; break; case BLKCLOSEZONE: op = REQ_OP_ZONE_CLOSE; break; case BLKFINISHZONE: op = REQ_OP_ZONE_FINISH; break; default: return -ENOTTY; } ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors); fail: if (cmd == BLKRESETZONE) { filemap_invalidate_unlock(bdev->bd_mapping); inode_unlock(bdev->bd_mapping->host); } return ret; } static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) { return zone->start + zone->len >= get_capacity(disk); } static bool disk_zone_is_full(struct gendisk *disk, unsigned int zno, unsigned int offset_in_zone) { if (zno < disk->nr_zones - 1) return offset_in_zone >= disk->zone_capacity; return offset_in_zone >= disk->last_zone_capacity; } static bool disk_zone_wplug_is_full(struct gendisk *disk, struct blk_zone_wplug *zwplug) { return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset); } static bool disk_insert_zone_wplug(struct gendisk *disk, struct blk_zone_wplug *zwplug) { struct blk_zone_wplug *zwplg; unsigned long flags; unsigned int idx = hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits); /* * Add the new zone write plug to the hash table, but carefully as we * are racing with other submission context, so we may already have a * zone write plug for the same zone. 
*/ spin_lock_irqsave(&disk->zone_wplugs_lock, flags); hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) { if (zwplg->zone_no == zwplug->zone_no) { spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); return false; } } hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); atomic_inc(&disk->nr_zone_wplugs); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); return true; } static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk, sector_t sector) { unsigned int zno = disk_zone_no(disk, sector); unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits); struct blk_zone_wplug *zwplug; rcu_read_lock(); hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) { if (zwplug->zone_no == zno && refcount_inc_not_zero(&zwplug->ref)) { rcu_read_unlock(); return zwplug; } } rcu_read_unlock(); return NULL; } static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, sector_t sector) { if (!atomic_read(&disk->nr_zone_wplugs)) return NULL; return disk_get_hashed_zone_wplug(disk, sector); } static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head) { struct blk_zone_wplug *zwplug = container_of(rcu_head, struct blk_zone_wplug, rcu_head); mempool_free(zwplug, zwplug->disk->zone_wplugs_pool); } static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) { if (refcount_dec_and_test(&zwplug->ref)) { WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)); call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); } } static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, struct blk_zone_wplug *zwplug) { lockdep_assert_held(&zwplug->lock); /* If the zone write plug was already removed, we are done. */ if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) return false; /* If the zone write plug is still plugged, it cannot be removed. */ if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) return false; /* * Completions of BIOs with blk_zone_write_plug_bio_endio() may * happen after handling a request completion with * blk_zone_write_plug_finish_request() (e.g. with split BIOs * that are chained). In such case, disk_zone_wplug_unplug_bio() * should not attempt to remove the zone write plug until all BIO * completions are seen. Check by looking at the zone write plug * reference count, which is 2 when the plug is unused (one reference * taken when the plug was allocated and another reference taken by the * caller context). */ if (refcount_read(&zwplug->ref) > 2) return false; /* We can remove zone write plugs for zones that are empty or full. */ return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug); } static void disk_remove_zone_wplug(struct gendisk *disk, struct blk_zone_wplug *zwplug) { unsigned long flags; /* If the zone write plug was already removed, we have nothing to do. */ if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) return; /* * Mark the zone write plug as unhashed and drop the extra reference we * took when the plug was inserted in the hash table. */ zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; spin_lock_irqsave(&disk->zone_wplugs_lock, flags); hlist_del_init_rcu(&zwplug->node); atomic_dec(&disk->nr_zone_wplugs); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); disk_put_zone_wplug(zwplug); } static void blk_zone_wplug_bio_work(struct work_struct *work); /* * Get a reference on the write plug for the zone containing @sector. * If the plug does not exist, it is allocated and hashed. 
* Return a pointer to the zone write plug with the plug spinlock held. */ static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk, sector_t sector, gfp_t gfp_mask, unsigned long *flags) { unsigned int zno = disk_zone_no(disk, sector); struct blk_zone_wplug *zwplug; again: zwplug = disk_get_zone_wplug(disk, sector); if (zwplug) { /* * Check that a BIO completion or a zone reset or finish * operation has not already removed the zone write plug from * the hash table and dropped its reference count. In such case, * we need to get a new plug so start over from the beginning. */ spin_lock_irqsave(&zwplug->lock, *flags); if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { spin_unlock_irqrestore(&zwplug->lock, *flags); disk_put_zone_wplug(zwplug); goto again; } return zwplug; } /* * Allocate and initialize a zone write plug with an extra reference * so that it is not freed when the zone write plug becomes idle without * the zone being full. */ zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask); if (!zwplug) return NULL; INIT_HLIST_NODE(&zwplug->node); refcount_set(&zwplug->ref, 2); spin_lock_init(&zwplug->lock); zwplug->flags = 0; zwplug->zone_no = zno; zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector); bio_list_init(&zwplug->bio_list); INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); zwplug->disk = disk; spin_lock_irqsave(&zwplug->lock, *flags); /* * Insert the new zone write plug in the hash table. This can fail only * if another context already inserted a plug. Retry from the beginning * in such case. */ if (!disk_insert_zone_wplug(disk, zwplug)) { spin_unlock_irqrestore(&zwplug->lock, *flags); mempool_free(zwplug, disk->zone_wplugs_pool); goto again; } return zwplug; } static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug, struct bio *bio) { struct request_queue *q = zwplug->disk->queue; bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); bio_io_error(bio); disk_put_zone_wplug(zwplug); /* Drop the reference taken by disk_zone_wplug_add_bio(() */ blk_queue_exit(q); } /* * Abort (fail) all plugged BIOs of a zone write plug. */ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) { struct bio *bio; if (bio_list_empty(&zwplug->bio_list)) return; pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n", zwplug->disk->disk_name, zwplug->zone_no); while ((bio = bio_list_pop(&zwplug->bio_list))) blk_zone_wplug_bio_io_error(zwplug, bio); } /* * Set a zone write plug write pointer offset to the specified value. * This aborts all plugged BIOs, which is fine as this function is called for * a zone reset operation, a zone finish operation or if the zone needs a wp * update from a report zone after a write error. */ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, struct blk_zone_wplug *zwplug, unsigned int wp_offset) { lockdep_assert_held(&zwplug->lock); /* Update the zone write pointer and abort all plugged BIOs. */ zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE; zwplug->wp_offset = wp_offset; disk_zone_wplug_abort(zwplug); /* * The zone write plug now has no BIO plugged: remove it from the * hash table so that it cannot be seen. The plug will be freed * when the last reference is dropped. 
*/ if (disk_should_remove_zone_wplug(disk, zwplug)) disk_remove_zone_wplug(disk, zwplug); } static unsigned int blk_zone_wp_offset(struct blk_zone *zone) { switch (zone->cond) { case BLK_ZONE_COND_IMP_OPEN: case BLK_ZONE_COND_EXP_OPEN: case BLK_ZONE_COND_CLOSED: return zone->wp - zone->start; case BLK_ZONE_COND_FULL: return zone->len; case BLK_ZONE_COND_EMPTY: return 0; case BLK_ZONE_COND_NOT_WP: case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_READONLY: default: /* * Conventional, offline and read-only zones do not have a valid * write pointer. */ return UINT_MAX; } } static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, struct blk_zone *zone) { struct blk_zone_wplug *zwplug; unsigned long flags; zwplug = disk_get_zone_wplug(disk, zone->start); if (!zwplug) return; spin_lock_irqsave(&zwplug->lock, flags); if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) disk_zone_wplug_set_wp_offset(disk, zwplug, blk_zone_wp_offset(zone)); spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); } static int disk_zone_sync_wp_offset(struct gendisk *disk, sector_t sector) { struct disk_report_zones_cb_args args = { .disk = disk, }; return disk->fops->report_zones(disk, sector, 1, disk_report_zones_cb, &args); } static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio, unsigned int wp_offset) { struct gendisk *disk = bio->bi_bdev->bd_disk; sector_t sector = bio->bi_iter.bi_sector; struct blk_zone_wplug *zwplug; unsigned long flags; /* Conventional zones cannot be reset nor finished. */ if (!bdev_zone_is_seq(bio->bi_bdev, sector)) { bio_io_error(bio); return true; } /* * No-wait reset or finish BIOs do not make much sense as the callers * issue these as blocking operations in most cases. To avoid issues * the BIO execution potentially failing with BLK_STS_AGAIN, warn about * REQ_NOWAIT being set and ignore that flag. */ if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) bio->bi_opf &= ~REQ_NOWAIT; /* * If we have a zone write plug, set its write pointer offset to 0 * (reset case) or to the zone size (finish case). This will abort all * BIOs plugged for the target zone. It is fine as resetting or * finishing zones while writes are still in-flight will result in the * writes failing anyway. */ zwplug = disk_get_zone_wplug(disk, sector); if (zwplug) { spin_lock_irqsave(&zwplug->lock, flags); disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset); spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); } return false; } static bool blk_zone_wplug_handle_reset_all(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; struct blk_zone_wplug *zwplug; unsigned long flags; sector_t sector; /* * Set the write pointer offset of all zone write plugs to 0. This will * abort all plugged BIOs. It is fine as resetting zones while writes * are still in-flight will result in the writes failing anyway. */ for (sector = 0; sector < get_capacity(disk); sector += disk->queue->limits.chunk_sectors) { zwplug = disk_get_zone_wplug(disk, sector); if (zwplug) { spin_lock_irqsave(&zwplug->lock, flags); disk_zone_wplug_set_wp_offset(disk, zwplug, 0); spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); } } return false; } static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, struct blk_zone_wplug *zwplug) { /* * Take a reference on the zone write plug and schedule the submission * of the next plugged BIO. blk_zone_wplug_bio_work() will release the * reference we take here. 
*/ WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); refcount_inc(&zwplug->ref); queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); } static inline void disk_zone_wplug_add_bio(struct gendisk *disk, struct blk_zone_wplug *zwplug, struct bio *bio, unsigned int nr_segs) { bool schedule_bio_work = false; /* * Grab an extra reference on the BIO request queue usage counter. * This reference will be reused to submit a request for the BIO for * blk-mq devices and dropped when the BIO is failed and after * it is issued in the case of BIO-based devices. */ percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter); /* * The BIO is being plugged and thus will have to wait for the on-going * write and for all other writes already plugged. So polling makes * no sense. */ bio_clear_polled(bio); /* * REQ_NOWAIT BIOs are always handled using the zone write plug BIO * work, which can block. So clear the REQ_NOWAIT flag and schedule the * work if this is the first BIO we are plugging. */ if (bio->bi_opf & REQ_NOWAIT) { schedule_bio_work = !(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); bio->bi_opf &= ~REQ_NOWAIT; } /* * Reuse the poll cookie field to store the number of segments when * split to the hardware limits. */ bio->__bi_nr_segments = nr_segs; /* * We always receive BIOs after they are split and ready to be issued. * The block layer passes the parts of a split BIO in order, and the * user must also issue write sequentially. So simply add the new BIO * at the tail of the list to preserve the sequential write order. */ bio_list_add(&zwplug->bio_list, bio); trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no, bio->bi_iter.bi_sector, bio_sectors(bio)); zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; if (schedule_bio_work) disk_zone_wplug_schedule_bio_work(disk, zwplug); } /* * Called from bio_attempt_back_merge() when a BIO was merged with a request. */ void blk_zone_write_plug_bio_merged(struct bio *bio) { struct blk_zone_wplug *zwplug; unsigned long flags; /* * If the BIO was already plugged, then we were called through * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge(). * For this case, we already hold a reference on the zone write plug for * the BIO and blk_zone_write_plug_init_request() will handle the * zone write pointer offset update. */ if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) return; bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); /* * Get a reference on the zone write plug of the target zone and advance * the zone write pointer offset. Given that this is a merge, we already * have at least one request and one BIO referencing the zone write * plug. So this should not fail. */ zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector); if (WARN_ON_ONCE(!zwplug)) return; spin_lock_irqsave(&zwplug->lock, flags); zwplug->wp_offset += bio_sectors(bio); spin_unlock_irqrestore(&zwplug->lock, flags); } /* * Attempt to merge plugged BIOs with a newly prepared request for a BIO that * already went through zone write plugging (either a new BIO or one that was * unplugged). 
*/ void blk_zone_write_plug_init_request(struct request *req) { sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req); struct request_queue *q = req->q; struct gendisk *disk = q->disk; struct blk_zone_wplug *zwplug = disk_get_zone_wplug(disk, blk_rq_pos(req)); unsigned long flags; struct bio *bio; if (WARN_ON_ONCE(!zwplug)) return; /* * Indicate that completion of this request needs to be handled with * blk_zone_write_plug_finish_request(), which will drop the reference * on the zone write plug we took above on entry to this function. */ req->rq_flags |= RQF_ZONE_WRITE_PLUGGING; if (blk_queue_nomerges(q)) return; /* * Walk through the list of plugged BIOs to check if they can be merged * into the back of the request. */ spin_lock_irqsave(&zwplug->lock, flags); while (!disk_zone_wplug_is_full(disk, zwplug)) { bio = bio_list_peek(&zwplug->bio_list); if (!bio) break; if (bio->bi_iter.bi_sector != req_back_sector || !blk_rq_merge_ok(req, bio)) break; WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES && !bio->__bi_nr_segments); bio_list_pop(&zwplug->bio_list); if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) != BIO_MERGE_OK) { bio_list_add_head(&zwplug->bio_list, bio); break; } /* Drop the reference taken by disk_zone_wplug_add_bio(). */ blk_queue_exit(q); zwplug->wp_offset += bio_sectors(bio); req_back_sector += bio_sectors(bio); } spin_unlock_irqrestore(&zwplug->lock, flags); } /* * Check and prepare a BIO for submission by incrementing the write pointer * offset of its zone write plug and changing zone append operations into * regular writes when zone append emulation is needed. */ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; lockdep_assert_held(&zwplug->lock); /* * If we lost track of the zone write pointer due to a write error, * the user must either execute a report zones, reset the zone, or finish * the zone to recover a reliable write pointer position. Fail BIOs if the * user did not do that as we cannot handle emulated zone append * otherwise. */ if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) return false; /* * Check that the user is not attempting to write to a full zone. * We know such a BIO will fail, and that would potentially overflow our * write pointer offset beyond the end of the zone. */ if (disk_zone_wplug_is_full(disk, zwplug)) return false; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { /* * Use a regular write starting at the current write pointer. * Similarly to native zone append operations, do not allow * merging. */ bio->bi_opf &= ~REQ_OP_MASK; bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE; bio->bi_iter.bi_sector += zwplug->wp_offset; /* * Remember that this BIO is in fact a zone append operation * so that we can restore its operation code on completion. */ bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND); } else { /* * Check for non-sequential writes early as we know that BIOs * with a start sector not aligned to the zone write pointer * will fail. */ if (bio_offset_from_zone_start(bio) != zwplug->wp_offset) return false; } /* Advance the zone write pointer offset. */ zwplug->wp_offset += bio_sectors(bio); return true; } static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) { struct gendisk *disk = bio->bi_bdev->bd_disk; sector_t sector = bio->bi_iter.bi_sector; struct blk_zone_wplug *zwplug; gfp_t gfp_mask = GFP_NOIO; unsigned long flags; /* * BIOs must be fully contained within a zone so that we use the correct * zone write plug for the entire BIO.
For blk-mq devices, the block * layer should already have done any splitting required to ensure this * and this BIO should thus not be straddling zone boundaries. For * BIO-based devices, it is the responsibility of the driver to split * the bio before submitting it. */ if (WARN_ON_ONCE(bio_straddles_zones(bio))) { bio_io_error(bio); return true; } /* Conventional zones do not need write plugging. */ if (!bdev_zone_is_seq(bio->bi_bdev, sector)) { /* Zone append to conventional zones is not allowed. */ if (bio_op(bio) == REQ_OP_ZONE_APPEND) { bio_io_error(bio); return true; } return false; } if (bio->bi_opf & REQ_NOWAIT) gfp_mask = GFP_NOWAIT; zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags); if (!zwplug) { if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); else bio_io_error(bio); return true; } /* Indicate that this BIO is being handled using zone write plugging. */ bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); /* * If the zone is already plugged, add the BIO to the plug BIO list. * Do the same for REQ_NOWAIT BIOs to ensure that we will not see a * BLK_STS_AGAIN failure if we let the BIO execute. * Otherwise, plug and let the BIO execute. */ if ((zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) || (bio->bi_opf & REQ_NOWAIT)) goto plug; if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { spin_unlock_irqrestore(&zwplug->lock, flags); bio_io_error(bio); return true; } zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; spin_unlock_irqrestore(&zwplug->lock, flags); return false; plug: disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs); spin_unlock_irqrestore(&zwplug->lock, flags); return true; } static void blk_zone_wplug_handle_native_zone_append(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; struct blk_zone_wplug *zwplug; unsigned long flags; /* * We have native support for zone append operations, so we are not * going to handle @bio through plugging. However, we may already have a * zone write plug for the target zone if that zone was previously * partially written using regular writes. In such case, we risk leaving * the plug in the disk hash table if the zone is fully written using * zone append operations. Avoid this by removing the zone write plug. */ zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); if (likely(!zwplug)) return; spin_lock_irqsave(&zwplug->lock, flags); /* * We are about to remove the zone write plug. But if the user * (mistakenly) has issued regular writes together with native zone * append, we must aborts the writes as otherwise the plugged BIOs would * not be executed by the plug BIO work as disk_get_zone_wplug() will * return NULL after the plug is removed. Aborting the plugged write * BIOs is consistent with the fact that these writes will most likely * fail anyway as there is no ordering guarantees between zone append * operations and regular write operations. */ if (!bio_list_empty(&zwplug->bio_list)) { pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n", disk->disk_name, zwplug->zone_no); disk_zone_wplug_abort(zwplug); } disk_remove_zone_wplug(disk, zwplug); spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); } /** * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging * @bio: The BIO being submitted * @nr_segs: The number of physical segments of @bio * * Handle write, write zeroes and zone append operations requiring emulation * using zone write plugging. * * Return true whenever @bio execution needs to be delayed through the zone * write plug. 
Otherwise, return false to let the submission path process * @bio normally. */ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) { struct block_device *bdev = bio->bi_bdev; if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash)) return false; /* * Regular writes and write zeroes need to be handled through the target * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH * which may need to go through the flush machinery depending on the * target device capabilities. Plugging such writes is fine as the flush * machinery operates at the request level, below the plug, and * completion of the flush sequence will go through the regular BIO * completion, which will handle zone write plugging. * Zone append operations for devices that requested emulation must * also be plugged so that these BIOs can be changed into regular * write BIOs. * Zone reset, reset all and finish commands need special treatment * to correctly track the write pointer offset of zones. These commands * are not plugged as we do not need serialization with write * operations. It is the responsibility of the user to not issue reset * and finish commands when write operations are in flight. */ switch (bio_op(bio)) { case REQ_OP_ZONE_APPEND: if (!bdev_emulates_zone_append(bdev)) { blk_zone_wplug_handle_native_zone_append(bio); return false; } fallthrough; case REQ_OP_WRITE: case REQ_OP_WRITE_ZEROES: return blk_zone_wplug_handle_write(bio, nr_segs); case REQ_OP_ZONE_RESET: return blk_zone_wplug_handle_reset_or_finish(bio, 0); case REQ_OP_ZONE_FINISH: return blk_zone_wplug_handle_reset_or_finish(bio, bdev_zone_sectors(bdev)); case REQ_OP_ZONE_RESET_ALL: return blk_zone_wplug_handle_reset_all(bio); default: return false; } return false; } EXPORT_SYMBOL_GPL(blk_zone_plug_bio); static void disk_zone_wplug_unplug_bio(struct gendisk *disk, struct blk_zone_wplug *zwplug) { unsigned long flags; spin_lock_irqsave(&zwplug->lock, flags); /* Schedule submission of the next plugged BIO if we have one. */ if (!bio_list_empty(&zwplug->bio_list)) { disk_zone_wplug_schedule_bio_work(disk, zwplug); spin_unlock_irqrestore(&zwplug->lock, flags); return; } zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; /* * If the zone is full (it was fully written or finished, or empty * (it was reset), remove its zone write plug from the hash table. */ if (disk_should_remove_zone_wplug(disk, zwplug)) disk_remove_zone_wplug(disk, zwplug); spin_unlock_irqrestore(&zwplug->lock, flags); } void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio) { /* * For zone append requests, the request sector indicates the location * at which the BIO data was written. Return this value to the BIO * issuer through the BIO iter sector. * For plugged zone writes, which include emulated zone append, we need * the original BIO sector so that blk_zone_write_plug_bio_endio() can * lookup the zone write plug. */ bio->bi_iter.bi_sector = rq->__sector; trace_blk_zone_append_update_request_bio(rq); } void blk_zone_write_plug_bio_endio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; struct blk_zone_wplug *zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); unsigned long flags; if (WARN_ON_ONCE(!zwplug)) return; /* Make sure we do not see this BIO again by clearing the plug flag. */ bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); /* * If this is a regular write emulating a zone append operation, * restore the original operation code. 
*/ if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) { bio->bi_opf &= ~REQ_OP_MASK; bio->bi_opf |= REQ_OP_ZONE_APPEND; bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND); } /* * If the BIO failed, abort all plugged BIOs and mark the plug as * needing a write pointer update. */ if (bio->bi_status != BLK_STS_OK) { spin_lock_irqsave(&zwplug->lock, flags); disk_zone_wplug_abort(zwplug); zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE; spin_unlock_irqrestore(&zwplug->lock, flags); } /* Drop the reference we took when the BIO was issued. */ disk_put_zone_wplug(zwplug); /* * For BIO-based devices, blk_zone_write_plug_finish_request() * is not called. So we need to schedule execution of the next * plugged BIO here. */ if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) disk_zone_wplug_unplug_bio(disk, zwplug); /* Drop the reference we took when entering this function. */ disk_put_zone_wplug(zwplug); } void blk_zone_write_plug_finish_request(struct request *req) { struct gendisk *disk = req->q->disk; struct blk_zone_wplug *zwplug; zwplug = disk_get_zone_wplug(disk, req->__sector); if (WARN_ON_ONCE(!zwplug)) return; req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING; /* * Drop the reference we took when the request was initialized in * blk_zone_write_plug_init_request(). */ disk_put_zone_wplug(zwplug); disk_zone_wplug_unplug_bio(disk, zwplug); /* Drop the reference we took when entering this function. */ disk_put_zone_wplug(zwplug); } static void blk_zone_wplug_bio_work(struct work_struct *work) { struct blk_zone_wplug *zwplug = container_of(work, struct blk_zone_wplug, bio_work); struct block_device *bdev; unsigned long flags; struct bio *bio; bool prepared; /* * Submit the next plugged BIO. If we do not have any, clear * the plugged flag. */ again: spin_lock_irqsave(&zwplug->lock, flags); bio = bio_list_pop(&zwplug->bio_list); if (!bio) { zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; spin_unlock_irqrestore(&zwplug->lock, flags); goto put_zwplug; } trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no, bio->bi_iter.bi_sector, bio_sectors(bio)); prepared = blk_zone_wplug_prepare_bio(zwplug, bio); spin_unlock_irqrestore(&zwplug->lock, flags); if (!prepared) { blk_zone_wplug_bio_io_error(zwplug, bio); goto again; } bdev = bio->bi_bdev; /* * blk-mq devices will reuse the extra reference on the request queue * usage counter we took when the BIO was plugged, but the submission * path for BIO-based devices will not do that. So drop this extra * reference here. */ if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) { bdev->bd_disk->fops->submit_bio(bio); blk_queue_exit(bdev->bd_disk->queue); } else { blk_mq_submit_bio(bio); } put_zwplug: /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */ disk_put_zone_wplug(zwplug); } static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) { return 1U << disk->zone_wplugs_hash_bits; } void disk_init_zone_resources(struct gendisk *disk) { spin_lock_init(&disk->zone_wplugs_lock); } /* * For the size of a disk zone write plug hash table, use the size of the * zone write plug mempool, which is the maximum of the disk open zones and * active zones limits. But do not exceed 4KB (512 hlist head entries), that is, * 9 bits. For a disk that has no limits, mempool size defaults to 128. 
*/ #define BLK_ZONE_WPLUG_MAX_HASH_BITS 9 #define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE 128 static int disk_alloc_zone_resources(struct gendisk *disk, unsigned int pool_size) { unsigned int i; atomic_set(&disk->nr_zone_wplugs, 0); disk->zone_wplugs_hash_bits = min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS); disk->zone_wplugs_hash = kcalloc(disk_zone_wplugs_hash_size(disk), sizeof(struct hlist_head), GFP_KERNEL); if (!disk->zone_wplugs_hash) return -ENOMEM; for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]); disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size, sizeof(struct blk_zone_wplug)); if (!disk->zone_wplugs_pool) goto free_hash; disk->zone_wplugs_wq = alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI, pool_size, disk->disk_name); if (!disk->zone_wplugs_wq) goto destroy_pool; return 0; destroy_pool: mempool_destroy(disk->zone_wplugs_pool); disk->zone_wplugs_pool = NULL; free_hash: kfree(disk->zone_wplugs_hash); disk->zone_wplugs_hash = NULL; disk->zone_wplugs_hash_bits = 0; return -ENOMEM; } static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) { struct blk_zone_wplug *zwplug; unsigned int i; if (!disk->zone_wplugs_hash) return; /* Free all the zone write plugs we have. */ for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { while (!hlist_empty(&disk->zone_wplugs_hash[i])) { zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, struct blk_zone_wplug, node); refcount_inc(&zwplug->ref); disk_remove_zone_wplug(disk, zwplug); disk_put_zone_wplug(zwplug); } } WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs)); kfree(disk->zone_wplugs_hash); disk->zone_wplugs_hash = NULL; disk->zone_wplugs_hash_bits = 0; } static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk, unsigned long *bitmap) { unsigned int nr_conv_zones = 0; unsigned long flags; spin_lock_irqsave(&disk->zone_wplugs_lock, flags); if (bitmap) nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones); bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap, lockdep_is_held(&disk->zone_wplugs_lock)); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); kfree_rcu_mightsleep(bitmap); return nr_conv_zones; } void disk_free_zone_resources(struct gendisk *disk) { if (!disk->zone_wplugs_pool) return; if (disk->zone_wplugs_wq) { destroy_workqueue(disk->zone_wplugs_wq); disk->zone_wplugs_wq = NULL; } disk_destroy_zone_wplugs_hash_table(disk); /* * Wait for the zone write plugs to be RCU-freed before * destorying the mempool. */ rcu_barrier(); mempool_destroy(disk->zone_wplugs_pool); disk->zone_wplugs_pool = NULL; disk_set_conv_zones_bitmap(disk, NULL); disk->zone_capacity = 0; disk->last_zone_capacity = 0; disk->nr_zones = 0; } static inline bool disk_need_zone_resources(struct gendisk *disk) { /* * All mq zoned devices need zone resources so that the block layer * can automatically handle write BIO plugging. BIO-based device drivers * (e.g. DM devices) are normally responsible for handling zone write * ordering and do not need zone resources, unless the driver requires * zone append emulation. */ return queue_is_mq(disk->queue) || queue_emulates_zone_append(disk->queue); } static int disk_revalidate_zone_resources(struct gendisk *disk, unsigned int nr_zones) { struct queue_limits *lim = &disk->queue->limits; unsigned int pool_size; if (!disk_need_zone_resources(disk)) return 0; /* * If the device has no limit on the maximum number of open and active * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE. 
*/ pool_size = max(lim->max_open_zones, lim->max_active_zones); if (!pool_size) pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones); if (!disk->zone_wplugs_hash) return disk_alloc_zone_resources(disk, pool_size); return 0; } struct blk_revalidate_zone_args { struct gendisk *disk; unsigned long *conv_zones_bitmap; unsigned int nr_zones; unsigned int zone_capacity; unsigned int last_zone_capacity; sector_t sector; }; /* * Update the disk zone resources information and device queue limits. * The disk queue is frozen when this is executed. */ static int disk_update_zone_resources(struct gendisk *disk, struct blk_revalidate_zone_args *args) { struct request_queue *q = disk->queue; unsigned int nr_seq_zones, nr_conv_zones; unsigned int pool_size; struct queue_limits lim; disk->nr_zones = args->nr_zones; disk->zone_capacity = args->zone_capacity; disk->last_zone_capacity = args->last_zone_capacity; nr_conv_zones = disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap); if (nr_conv_zones >= disk->nr_zones) { pr_warn("%s: Invalid number of conventional zones %u / %u\n", disk->disk_name, nr_conv_zones, disk->nr_zones); return -ENODEV; } lim = queue_limits_start_update(q); /* * Some devices can advertize zone resource limits that are larger than * the number of sequential zones of the zoned block device, e.g. a * small ZNS namespace. For such case, assume that the zoned device has * no zone resource limits. */ nr_seq_zones = disk->nr_zones - nr_conv_zones; if (lim.max_open_zones >= nr_seq_zones) lim.max_open_zones = 0; if (lim.max_active_zones >= nr_seq_zones) lim.max_active_zones = 0; if (!disk->zone_wplugs_pool) goto commit; /* * If the device has no limit on the maximum number of open and active * zones, set its max open zone limit to the mempool size to indicate * to the user that there is a potential performance impact due to * dynamic zone write plug allocation when simultaneously writing to * more zones than the size of the mempool. */ pool_size = max(lim.max_open_zones, lim.max_active_zones); if (!pool_size) pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones); mempool_resize(disk->zone_wplugs_pool, pool_size); if (!lim.max_open_zones && !lim.max_active_zones) { if (pool_size < nr_seq_zones) lim.max_open_zones = pool_size; else lim.max_open_zones = 0; } commit: return queue_limits_commit_update_frozen(q, &lim); } static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, struct blk_revalidate_zone_args *args) { struct gendisk *disk = args->disk; if (zone->capacity != zone->len) { pr_warn("%s: Invalid conventional zone capacity\n", disk->disk_name); return -ENODEV; } if (disk_zone_is_last(disk, zone)) args->last_zone_capacity = zone->capacity; if (!disk_need_zone_resources(disk)) return 0; if (!args->conv_zones_bitmap) { args->conv_zones_bitmap = bitmap_zalloc(args->nr_zones, GFP_NOIO); if (!args->conv_zones_bitmap) return -ENOMEM; } set_bit(idx, args->conv_zones_bitmap); return 0; } static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, struct blk_revalidate_zone_args *args) { struct gendisk *disk = args->disk; struct blk_zone_wplug *zwplug; unsigned int wp_offset; unsigned long flags; /* * Remember the capacity of the first sequential zone and check * if it is constant for all zones, ignoring the last zone as it can be * smaller. 
*/ if (!args->zone_capacity) args->zone_capacity = zone->capacity; if (disk_zone_is_last(disk, zone)) { args->last_zone_capacity = zone->capacity; } else if (zone->capacity != args->zone_capacity) { pr_warn("%s: Invalid variable zone capacity\n", disk->disk_name); return -ENODEV; } /* * If the device needs zone append emulation, we need to track the * write pointer of all zones that are not empty nor full. So make sure * we have a zone write plug for such zone if the device has a zone * write plug hash table. */ if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash) return 0; disk_zone_wplug_sync_wp_offset(disk, zone); wp_offset = blk_zone_wp_offset(zone); if (!wp_offset || wp_offset >= zone->capacity) return 0; zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags); if (!zwplug) return -ENOMEM; spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); return 0; } /* * Helper function to check the validity of zones of a zoned block device. */ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct blk_revalidate_zone_args *args = data; struct gendisk *disk = args->disk; sector_t zone_sectors = disk->queue->limits.chunk_sectors; int ret; /* Check for bad zones and holes in the zone report */ if (zone->start != args->sector) { pr_warn("%s: Zone gap at sectors %llu..%llu\n", disk->disk_name, args->sector, zone->start); return -ENODEV; } if (zone->start >= get_capacity(disk) || !zone->len) { pr_warn("%s: Invalid zone start %llu, length %llu\n", disk->disk_name, zone->start, zone->len); return -ENODEV; } /* * All zones must have the same size, with the exception on an eventual * smaller last zone. */ if (!disk_zone_is_last(disk, zone)) { if (zone->len != zone_sectors) { pr_warn("%s: Invalid zoned device with non constant zone size\n", disk->disk_name); return -ENODEV; } } else if (zone->len > zone_sectors) { pr_warn("%s: Invalid zoned device with larger last zone size\n", disk->disk_name); return -ENODEV; } if (!zone->capacity || zone->capacity > zone->len) { pr_warn("%s: Invalid zone capacity\n", disk->disk_name); return -ENODEV; } /* Check zone type */ switch (zone->type) { case BLK_ZONE_TYPE_CONVENTIONAL: ret = blk_revalidate_conv_zone(zone, idx, args); break; case BLK_ZONE_TYPE_SEQWRITE_REQ: ret = blk_revalidate_seq_zone(zone, idx, args); break; case BLK_ZONE_TYPE_SEQWRITE_PREF: default: pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", disk->disk_name, (int)zone->type, zone->start); ret = -ENODEV; } if (!ret) args->sector += zone->len; return ret; } /** * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs * @disk: Target disk * * Helper function for low-level device drivers to check, (re) allocate and * initialize resources used for managing zoned disks. This function should * normally be called by blk-mq based drivers when a zoned gendisk is probed * and when the zone configuration of the gendisk changes (e.g. after a format). * Before calling this function, the device driver must already have set the * device zone size (chunk_sector limit) and the max zone append limit. * BIO based drivers can also use this function as long as the device queue * can be safely frozen. 
*/ int blk_revalidate_disk_zones(struct gendisk *disk) { struct request_queue *q = disk->queue; sector_t zone_sectors = q->limits.chunk_sectors; sector_t capacity = get_capacity(disk); struct blk_revalidate_zone_args args = { }; unsigned int noio_flag; int ret = -ENOMEM; if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) return -EIO; if (!capacity) return -ENODEV; /* * Checks that the device driver indicated a valid zone size and that * the max zone append limit is set. */ if (!zone_sectors || !is_power_of_2(zone_sectors)) { pr_warn("%s: Invalid non power of two zone size (%llu)\n", disk->disk_name, zone_sectors); return -ENODEV; } /* * Ensure that all memory allocations in this context are done as if * GFP_NOIO was specified. */ args.disk = disk; args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); noio_flag = memalloc_noio_save(); ret = disk_revalidate_zone_resources(disk, args.nr_zones); if (ret) { memalloc_noio_restore(noio_flag); return ret; } ret = disk->fops->report_zones(disk, 0, UINT_MAX, blk_revalidate_zone_cb, &args); if (!ret) { pr_warn("%s: No zones reported\n", disk->disk_name); ret = -ENODEV; } memalloc_noio_restore(noio_flag); /* * If zones where reported, make sure that the entire disk capacity * has been checked. */ if (ret > 0 && args.sector != capacity) { pr_warn("%s: Missing zones from sector %llu\n", disk->disk_name, args.sector); ret = -ENODEV; } /* * Set the new disk zone parameters only once the queue is frozen and * all I/Os are completed. */ if (ret > 0) ret = disk_update_zone_resources(disk, &args); else pr_warn("%s: failed to revalidate zones\n", disk->disk_name); if (ret) { unsigned int memflags = blk_mq_freeze_queue(q); disk_free_zone_resources(disk); blk_mq_unfreeze_queue(q, memflags); } return ret; } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); /** * blk_zone_issue_zeroout - zero-fill a block range in a zone * @bdev: blockdev to write * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) * * Description: * Zero-fill a block range in a zone (@sector must be equal to the zone write * pointer), handling potential errors due to the (initially unknown) lack of * hardware offload (See blkdev_issue_zeroout()). */ int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask) { int ret; if (WARN_ON_ONCE(!bdev_is_zoned(bdev))) return -EIO; ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, BLKDEV_ZERO_NOFALLBACK); if (ret != -EOPNOTSUPP) return ret; /* * The failed call to blkdev_issue_zeroout() advanced the zone write * pointer. Undo this using a report zone to update the zone write * pointer to the correct current value. */ ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector); if (ret != 1) return ret < 0 ? ret : -EIO; /* * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a * regular write with zero-pages. 
*/ return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0); } EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout); #ifdef CONFIG_BLK_DEBUG_FS static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug, struct seq_file *m) { unsigned int zwp_wp_offset, zwp_flags; unsigned int zwp_zone_no, zwp_ref; unsigned int zwp_bio_list_size; unsigned long flags; spin_lock_irqsave(&zwplug->lock, flags); zwp_zone_no = zwplug->zone_no; zwp_flags = zwplug->flags; zwp_ref = refcount_read(&zwplug->ref); zwp_wp_offset = zwplug->wp_offset; zwp_bio_list_size = bio_list_size(&zwplug->bio_list); spin_unlock_irqrestore(&zwplug->lock, flags); seq_printf(m, "%u 0x%x %u %u %u\n", zwp_zone_no, zwp_flags, zwp_ref, zwp_wp_offset, zwp_bio_list_size); } int queue_zone_wplugs_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct gendisk *disk = q->disk; struct blk_zone_wplug *zwplug; unsigned int i; if (!disk->zone_wplugs_hash) return 0; rcu_read_lock(); for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i], node) queue_zone_wplug_show(zwplug, m); rcu_read_unlock(); return 0; } #endif |
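The zoned block device code above exposes blkdev_report_zones() together with a per-zone callback. As a minimal illustrative sketch (not part of the source above; the helpers count_open_zone_cb() and count_open_zones() are hypothetical names), a caller could count the zones currently reported in an open condition like this:

static int count_open_zone_cb(struct blk_zone *zone, unsigned int idx,
			      void *data)
{
	unsigned int *nr_open = data;

	/* Count implicitly and explicitly open zones reported by the device. */
	if (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
	    zone->cond == BLK_ZONE_COND_EXP_OPEN)
		(*nr_open)++;

	return 0;
}

static int count_open_zones(struct block_device *bdev, unsigned int *nr_open)
{
	int ret;

	*nr_open = 0;

	/* Report all zones starting from sector 0, as documented above. */
	ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
				  count_open_zone_cb, nr_open);
	if (ret < 0)
		return ret;

	return 0;
}

On success blkdev_report_zones() returns the number of zones reported, which this sketch ignores; a negative value simply propagates the driver error.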
| 22140 268 14750 253 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_PREEMPT_H #define __ASM_PREEMPT_H #include <asm/rmwcc.h> #include <asm/percpu.h> #include <linux/static_call_types.h> DECLARE_PER_CPU_CACHE_HOT(int, __preempt_count); /* We use the MSB mostly because its available */ #define PREEMPT_NEED_RESCHED 0x80000000 /* * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such * that a decrement hitting 0 means we can and should reschedule. */ #define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED) /* * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users * that think a non-zero value indicates we cannot preempt. */ static __always_inline int preempt_count(void) { return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; } static __always_inline void preempt_count_set(int pc) { int old, new; old = raw_cpu_read_4(__preempt_count); do { new = (old & PREEMPT_NEED_RESCHED) | (pc & ~PREEMPT_NEED_RESCHED); } while (!raw_cpu_try_cmpxchg_4(__preempt_count, &old, new)); } /* * must be macros to avoid header recursion hell */ #define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \ } while (0) /* * We fold the NEED_RESCHED bit into the preempt count such that * preempt_enable() can decrement and test for needing to reschedule with a * single instruction. * * We invert the actual bit, so that when the decrement hits 0 we know we both * need to resched (the bit is cleared) and can resched (no preempt count). */ static __always_inline void set_preempt_need_resched(void) { raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); } static __always_inline void clear_preempt_need_resched(void) { raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); } static __always_inline bool test_preempt_need_resched(void) { return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); } /* * The various preempt_count add/sub methods */ static __always_inline void __preempt_count_add(int val) { raw_cpu_add_4(__preempt_count, val); } static __always_inline void __preempt_count_sub(int val) { raw_cpu_add_4(__preempt_count, -val); } /* * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule * a decrement which hits zero means we have no preempt_count and should * reschedule. */ static __always_inline bool __preempt_count_dec_and_test(void) { return GEN_UNARY_RMWcc("decl", __my_cpu_var(__preempt_count), e, __percpu_arg([var])); } /* * Returns true when we need to resched and can (barring IRQ state). 
*/ static __always_inline bool should_resched(int preempt_offset) { return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); } #ifdef CONFIG_PREEMPTION extern asmlinkage void preempt_schedule(void); extern asmlinkage void preempt_schedule_thunk(void); #define preempt_schedule_dynamic_enabled preempt_schedule_thunk #define preempt_schedule_dynamic_disabled NULL extern asmlinkage void preempt_schedule_notrace(void); extern asmlinkage void preempt_schedule_notrace_thunk(void); #define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace_thunk #define preempt_schedule_notrace_dynamic_disabled NULL #ifdef CONFIG_PREEMPT_DYNAMIC DECLARE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); #define __preempt_schedule() \ do { \ __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule); \ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \ } while (0) DECLARE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); #define __preempt_schedule_notrace() \ do { \ __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule_notrace); \ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule_notrace) : ASM_CALL_CONSTRAINT); \ } while (0) #else /* PREEMPT_DYNAMIC */ #define __preempt_schedule() \ asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT); #define __preempt_schedule_notrace() \ asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT); #endif /* PREEMPT_DYNAMIC */ #endif /* PREEMPTION */ #endif /* __ASM_PREEMPT_H */ |
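The NEED_RESCHED folding described in the comments above is easier to see in isolation: the MSB is kept set while no reschedule is pending, so a single decrement of the whole word reaching zero proves both that the preempt count dropped to zero and that a reschedule is wanted. A hedged user-space model of that inversion (all names are local to this sketch, not the kernel's per-CPU implementation):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NEED_RESCHED_BIT 0x80000000u

static uint32_t pc = NEED_RESCHED_BIT;	/* "enabled": count 0, bit set */

static uint32_t preempt_count(void)       { return pc & ~NEED_RESCHED_BIT; }
static void preempt_disable(void)         { pc++; }
static void set_need_resched(void)        { pc &= ~NEED_RESCHED_BIT; } /* inverted: cleared = needed */
static void clear_need_resched(void)      { pc |= NEED_RESCHED_BIT; }

/* Mirrors __preempt_count_dec_and_test(): one decrement, one test for zero. */
static bool preempt_enable_and_test(void) { return --pc == 0; }

int main(void)
{
	preempt_disable();	/* word = 0x80000001: count 1, no resched wanted */
	set_need_resched();	/* word = 0x00000001: bit cleared, only the count remains */

	if (preempt_enable_and_test())
		printf("count hit 0 and a reschedule is pending: call schedule()\n");

	clear_need_resched();
	printf("count=%u\n", preempt_count());	/* 0 */
	return 0;
}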
| 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 | // SPDX-License-Identifier: GPL-2.0 #include <linux/fb.h> #include <linux/linux_logo.h> #include "fb_internal.h" bool fb_center_logo __read_mostly; int fb_logo_count __read_mostly = -1; static inline unsigned int safe_shift(unsigned int d, int n) { return n < 0 ? d >> -n : d << n; } static void fb_set_logocmap(struct fb_info *info, const struct linux_logo *logo) { struct fb_cmap palette_cmap; u16 palette_green[16]; u16 palette_blue[16]; u16 palette_red[16]; int i, j, n; const unsigned char *clut = logo->clut; palette_cmap.start = 0; palette_cmap.len = 16; palette_cmap.red = palette_red; palette_cmap.green = palette_green; palette_cmap.blue = palette_blue; palette_cmap.transp = NULL; for (i = 0; i < logo->clutsize; i += n) { n = logo->clutsize - i; /* palette_cmap provides space for only 16 colors at once */ if (n > 16) n = 16; palette_cmap.start = 32 + i; palette_cmap.len = n; for (j = 0; j < n; ++j) { palette_cmap.red[j] = clut[0] << 8 | clut[0]; palette_cmap.green[j] = clut[1] << 8 | clut[1]; palette_cmap.blue[j] = clut[2] << 8 | clut[2]; clut += 3; } fb_set_cmap(&palette_cmap, info); } } static void fb_set_logo_truepalette(struct fb_info *info, const struct linux_logo *logo, u32 *palette) { static const unsigned char mask[] = { 0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff }; unsigned char redmask, greenmask, bluemask; int redshift, greenshift, blueshift; int i; const unsigned char *clut = logo->clut; /* * We have to create a temporary palette since console palette is only * 16 colors long. */ /* Bug: Doesn't obey msb_right ... (who needs that?) 
*/ redmask = mask[info->var.red.length < 8 ? info->var.red.length : 8]; greenmask = mask[info->var.green.length < 8 ? info->var.green.length : 8]; bluemask = mask[info->var.blue.length < 8 ? info->var.blue.length : 8]; redshift = info->var.red.offset - (8 - info->var.red.length); greenshift = info->var.green.offset - (8 - info->var.green.length); blueshift = info->var.blue.offset - (8 - info->var.blue.length); for (i = 0; i < logo->clutsize; i++) { palette[i+32] = (safe_shift((clut[0] & redmask), redshift) | safe_shift((clut[1] & greenmask), greenshift) | safe_shift((clut[2] & bluemask), blueshift)); clut += 3; } } static void fb_set_logo_directpalette(struct fb_info *info, const struct linux_logo *logo, u32 *palette) { int redshift, greenshift, blueshift; int i; redshift = info->var.red.offset; greenshift = info->var.green.offset; blueshift = info->var.blue.offset; for (i = 32; i < 32 + logo->clutsize; i++) palette[i] = i << redshift | i << greenshift | i << blueshift; } static void fb_set_logo(struct fb_info *info, const struct linux_logo *logo, u8 *dst, int depth) { int i, j, k; const u8 *src = logo->data; u8 xor = (info->fix.visual == FB_VISUAL_MONO01) ? 0xff : 0; u8 fg = 1, d; switch (fb_get_color_depth(&info->var, &info->fix)) { case 1: fg = 1; break; case 2: fg = 3; break; default: fg = 7; break; } if (info->fix.visual == FB_VISUAL_MONO01 || info->fix.visual == FB_VISUAL_MONO10) fg = ~((u8) (0xfff << info->var.green.length)); switch (depth) { case 4: for (i = 0; i < logo->height; i++) for (j = 0; j < logo->width; src++) { *dst++ = *src >> 4; j++; if (j < logo->width) { *dst++ = *src & 0x0f; j++; } } break; case 1: for (i = 0; i < logo->height; i++) { for (j = 0; j < logo->width; src++) { d = *src ^ xor; for (k = 7; k >= 0 && j < logo->width; k--) { *dst++ = ((d >> k) & 1) ? fg : 0; j++; } } } break; } } /* * Three (3) kinds of logo maps exist. linux_logo_clut224 (>16 colors), * linux_logo_vga16 (16 colors) and linux_logo_mono (2 colors). Depending on * the visual format and color depth of the framebuffer, the DAC, the * pseudo_palette, and the logo data will be adjusted accordingly. * * Case 1 - linux_logo_clut224: * Color exceeds the number of console colors (16), thus we set the hardware DAC * using fb_set_cmap() appropriately. The "needs_cmapreset" flag will be set. * * For visuals that require color info from the pseudo_palette, we also construct * one for temporary use. The "needs_directpalette" or "needs_truepalette" flags * will be set. * * Case 2 - linux_logo_vga16: * The number of colors just matches the console colors, thus there is no need * to set the DAC or the pseudo_palette. However, the bitmap is packed, ie, * each byte contains color information for two pixels (upper and lower nibble). * To be consistent with fb_imageblit() usage, we therefore separate the two * nibbles into separate bytes. The "depth" flag will be set to 4. * * Case 3 - linux_logo_mono: * This is similar with Case 2. Each byte contains information for 8 pixels. * We isolate each bit and expand each into a byte. The "depth" flag will * be set to 1. 
*/ static struct logo_data { int depth; int needs_directpalette; int needs_truepalette; int needs_cmapreset; const struct linux_logo *logo; } fb_logo __read_mostly; static void fb_rotate_logo_ud(const u8 *in, u8 *out, u32 width, u32 height) { u32 size = width * height, i; out += size - 1; for (i = size; i--; ) *out-- = *in++; } static void fb_rotate_logo_cw(const u8 *in, u8 *out, u32 width, u32 height) { int i, j, h = height - 1; for (i = 0; i < height; i++) for (j = 0; j < width; j++) out[height * j + h - i] = *in++; } static void fb_rotate_logo_ccw(const u8 *in, u8 *out, u32 width, u32 height) { int i, j, w = width - 1; for (i = 0; i < height; i++) for (j = 0; j < width; j++) out[height * (w - j) + i] = *in++; } static void fb_rotate_logo(struct fb_info *info, u8 *dst, struct fb_image *image, int rotate) { u32 tmp; if (rotate == FB_ROTATE_UD) { fb_rotate_logo_ud(image->data, dst, image->width, image->height); image->dx = info->var.xres - image->width - image->dx; image->dy = info->var.yres - image->height - image->dy; } else if (rotate == FB_ROTATE_CW) { fb_rotate_logo_cw(image->data, dst, image->width, image->height); swap(image->width, image->height); tmp = image->dy; image->dy = image->dx; image->dx = info->var.xres - image->width - tmp; } else if (rotate == FB_ROTATE_CCW) { fb_rotate_logo_ccw(image->data, dst, image->width, image->height); swap(image->width, image->height); tmp = image->dx; image->dx = image->dy; image->dy = info->var.yres - image->height - tmp; } image->data = dst; } static void fb_do_show_logo(struct fb_info *info, struct fb_image *image, int rotate, unsigned int num) { unsigned int x; if (image->width > info->var.xres || image->height > info->var.yres) return; if (rotate == FB_ROTATE_UR) { for (x = 0; x < num && image->dx + image->width <= info->var.xres; x++) { info->fbops->fb_imageblit(info, image); image->dx += image->width + 8; } } else if (rotate == FB_ROTATE_UD) { u32 dx = image->dx; for (x = 0; x < num && image->dx <= dx; x++) { info->fbops->fb_imageblit(info, image); image->dx -= image->width + 8; } } else if (rotate == FB_ROTATE_CW) { for (x = 0; x < num && image->dy + image->height <= info->var.yres; x++) { info->fbops->fb_imageblit(info, image); image->dy += image->height + 8; } } else if (rotate == FB_ROTATE_CCW) { u32 dy = image->dy; for (x = 0; x < num && image->dy <= dy; x++) { info->fbops->fb_imageblit(info, image); image->dy -= image->height + 8; } } } static int fb_show_logo_line(struct fb_info *info, int rotate, const struct linux_logo *logo, int y, unsigned int n) { u32 *palette = NULL, *saved_pseudo_palette = NULL; unsigned char *logo_new = NULL, *logo_rotate = NULL; struct fb_image image; /* Return if the frame buffer is not mapped or suspended */ if (logo == NULL || info->state != FBINFO_STATE_RUNNING || info->fbops->owner) return 0; image.depth = 8; image.data = logo->data; if (fb_logo.needs_cmapreset) fb_set_logocmap(info, logo); if (fb_logo.needs_truepalette || fb_logo.needs_directpalette) { palette = kmalloc(256 * 4, GFP_KERNEL); if (palette == NULL) return 0; if (fb_logo.needs_truepalette) fb_set_logo_truepalette(info, logo, palette); else fb_set_logo_directpalette(info, logo, palette); saved_pseudo_palette = info->pseudo_palette; info->pseudo_palette = palette; } if (fb_logo.depth <= 4) { logo_new = kmalloc_array(logo->width, logo->height, GFP_KERNEL); if (logo_new == NULL) { kfree(palette); if (saved_pseudo_palette) info->pseudo_palette = saved_pseudo_palette; return 0; } image.data = logo_new; fb_set_logo(info, logo, logo_new, 
fb_logo.depth); } if (fb_center_logo) { int xres = info->var.xres; int yres = info->var.yres; if (rotate == FB_ROTATE_CW || rotate == FB_ROTATE_CCW) { xres = info->var.yres; yres = info->var.xres; } while (n && (n * (logo->width + 8) - 8 > xres)) --n; image.dx = (xres - (n * (logo->width + 8) - 8)) / 2; image.dy = y ?: (yres - logo->height) / 2; } else { image.dx = 0; image.dy = y; } image.width = logo->width; image.height = logo->height; if (rotate) { logo_rotate = kmalloc_array(logo->width, logo->height, GFP_KERNEL); if (logo_rotate) fb_rotate_logo(info, logo_rotate, &image, rotate); } fb_do_show_logo(info, &image, rotate, n); kfree(palette); if (saved_pseudo_palette != NULL) info->pseudo_palette = saved_pseudo_palette; kfree(logo_new); kfree(logo_rotate); return image.dy + logo->height; } #ifdef CONFIG_FB_LOGO_EXTRA #define FB_LOGO_EX_NUM_MAX 10 static struct logo_data_extra { const struct linux_logo *logo; unsigned int n; } fb_logo_ex[FB_LOGO_EX_NUM_MAX]; static unsigned int fb_logo_ex_num; void fb_append_extra_logo(const struct linux_logo *logo, unsigned int n) { if (!n || fb_logo_ex_num == FB_LOGO_EX_NUM_MAX) return; fb_logo_ex[fb_logo_ex_num].logo = logo; fb_logo_ex[fb_logo_ex_num].n = n; fb_logo_ex_num++; } static int fb_prepare_extra_logos(struct fb_info *info, unsigned int height, unsigned int yres) { unsigned int i; /* FIXME: logo_ex supports only truecolor fb. */ if (info->fix.visual != FB_VISUAL_TRUECOLOR) fb_logo_ex_num = 0; for (i = 0; i < fb_logo_ex_num; i++) { if (fb_logo_ex[i].logo->type != fb_logo.logo->type) { fb_logo_ex[i].logo = NULL; continue; } height += fb_logo_ex[i].logo->height; if (height > yres) { height -= fb_logo_ex[i].logo->height; fb_logo_ex_num = i; break; } } return height; } static int fb_show_extra_logos(struct fb_info *info, int y, int rotate) { unsigned int i; for (i = 0; i < fb_logo_ex_num; i++) y = fb_show_logo_line(info, rotate, fb_logo_ex[i].logo, y, fb_logo_ex[i].n); return y; } #endif /* CONFIG_FB_LOGO_EXTRA */ int fb_prepare_logo(struct fb_info *info, int rotate) { int depth = fb_get_color_depth(&info->var, &info->fix); unsigned int yres; int height; memset(&fb_logo, 0, sizeof(struct logo_data)); if (info->flags & FBINFO_MISC_TILEBLITTING || info->fbops->owner || !fb_logo_count) return 0; if (info->fix.visual == FB_VISUAL_DIRECTCOLOR) { depth = info->var.blue.length; if (info->var.red.length < depth) depth = info->var.red.length; if (info->var.green.length < depth) depth = info->var.green.length; } if (info->fix.visual == FB_VISUAL_STATIC_PSEUDOCOLOR && depth > 4) { /* assume console colormap */ depth = 4; } /* Return if no suitable logo was found */ fb_logo.logo = fb_find_logo(depth); if (!fb_logo.logo) return 0; if (rotate == FB_ROTATE_UR || rotate == FB_ROTATE_UD) yres = info->var.yres; else yres = info->var.xres; if (fb_logo.logo->height > yres) { fb_logo.logo = NULL; return 0; } /* What depth we asked for might be different from what we get */ if (fb_logo.logo->type == LINUX_LOGO_CLUT224) fb_logo.depth = 8; else if (fb_logo.logo->type == LINUX_LOGO_VGA16) fb_logo.depth = 4; else fb_logo.depth = 1; if (fb_logo.depth > 4 && depth > 4) { switch (info->fix.visual) { case FB_VISUAL_TRUECOLOR: fb_logo.needs_truepalette = 1; break; case FB_VISUAL_DIRECTCOLOR: fb_logo.needs_directpalette = 1; fb_logo.needs_cmapreset = 1; break; case FB_VISUAL_PSEUDOCOLOR: fb_logo.needs_cmapreset = 1; break; } } height = fb_logo.logo->height; if (fb_center_logo) height += (yres - fb_logo.logo->height) / 2; #ifdef CONFIG_FB_LOGO_EXTRA height = 
fb_prepare_extra_logos(info, height, yres); #endif return height; } int fb_show_logo(struct fb_info *info, int rotate) { unsigned int count; int y; if (!fb_logo_count) return 0; count = fb_logo_count < 0 ? num_online_cpus() : fb_logo_count; y = fb_show_logo_line(info, rotate, fb_logo.logo, 0, count); #ifdef CONFIG_FB_LOGO_EXTRA y = fb_show_extra_logos(info, y, rotate); #endif return y; } |
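For illustration only (not part of the framebuffer code above): the depth-1 case of fb_set_logo() expands each packed monochrome bit into a full byte holding either the foreground index or zero, MSB first, stopping at the logo width. A standalone sketch of that expansion, with an illustrative function name:

#include <stdint.h>
#include <stdio.h>

/*
 * Expand a 1-bit-per-pixel row into one byte per pixel, as done for
 * LINUX_LOGO_MONO: MSB first, and "width" may end mid-byte.
 */
static void expand_mono_row(const uint8_t *src, uint8_t *dst,
			    unsigned int width, uint8_t fg, uint8_t xor)
{
	unsigned int j = 0;

	for (; j < width; src++) {
		uint8_t d = *src ^ xor;	/* invert for FB_VISUAL_MONO01-style visuals */
		int k;

		for (k = 7; k >= 0 && j < width; k--, j++)
			*dst++ = (d >> k) & 1 ? fg : 0;
	}
}

int main(void)
{
	const uint8_t row[2] = { 0xA5, 0xC0 };	/* bits: 10100101 11...... */
	uint8_t out[10];
	unsigned int i;

	expand_mono_row(row, out, 10, 7, 0);
	for (i = 0; i < 10; i++)
		printf("%u ", out[i]);
	printf("\n");	/* 7 0 7 0 0 7 0 7 7 7 */
	return 0;
}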
| 11 9 11 10 10 3 1 8 9 2 9 9 8 9 9 9 1 1 1 9 9 4 9 9 2 8 9 8 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 | // SPDX-License-Identifier: GPL-2.0-only /* * vivid-radio-rx.c - radio receiver support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/delay.h> #include <linux/videodev2.h> #include <linux/v4l2-dv-timings.h> #include <linux/sched/signal.h> #include <media/v4l2-common.h> #include <media/v4l2-event.h> #include <media/v4l2-dv-timings.h> #include "vivid-core.h" #include "vivid-ctrls.h" #include "vivid-radio-common.h" #include "vivid-rds-gen.h" #include "vivid-radio-rx.h" ssize_t vivid_radio_rx_read(struct file *file, char __user *buf, size_t size, loff_t *offset) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_rds_data *data = dev->rds_gen.data; bool use_alternates; ktime_t timestamp; unsigned blk; int perc; int i; if (dev->radio_rx_rds_controls) return -EINVAL; if (size < sizeof(*data)) return 0; size = sizeof(*data) * (size / sizeof(*data)); if (mutex_lock_interruptible(&dev->mutex)) return -ERESTARTSYS; if (dev->radio_rx_rds_owner && file_to_v4l2_fh(file) != dev->radio_rx_rds_owner) { mutex_unlock(&dev->mutex); return -EBUSY; } if (dev->radio_rx_rds_owner == NULL) { vivid_radio_rds_init(dev); dev->radio_rx_rds_owner = file_to_v4l2_fh(file); } retry: timestamp = ktime_sub(ktime_get(), dev->radio_rds_init_time); blk = ktime_divns(timestamp, VIVID_RDS_NSEC_PER_BLK); use_alternates = (blk % VIVID_RDS_GEN_BLOCKS) & 1; if (dev->radio_rx_rds_last_block == 0 || dev->radio_rx_rds_use_alternates != use_alternates) { dev->radio_rx_rds_use_alternates = use_alternates; /* Re-init the RDS generator */ vivid_radio_rds_init(dev); } if (blk >= dev->radio_rx_rds_last_block + VIVID_RDS_GEN_BLOCKS) dev->radio_rx_rds_last_block = blk - VIVID_RDS_GEN_BLOCKS + 1; /* * No data is available if there hasn't been time to get new data, * or if the RDS receiver has been disabled, or if we use the data * from the RDS transmitter and that RDS transmitter has been disabled, * or if the signal quality is too weak. 
*/ if (blk == dev->radio_rx_rds_last_block || !dev->radio_rx_rds_enabled || (dev->radio_rds_loop && !(dev->radio_tx_subchans & V4L2_TUNER_SUB_RDS)) || abs(dev->radio_rx_sig_qual) > 200) { mutex_unlock(&dev->mutex); if (file->f_flags & O_NONBLOCK) return -EWOULDBLOCK; if (msleep_interruptible(20) && signal_pending(current)) return -EINTR; if (mutex_lock_interruptible(&dev->mutex)) return -ERESTARTSYS; goto retry; } /* abs(dev->radio_rx_sig_qual) <= 200, map that to a 0-50% range */ perc = abs(dev->radio_rx_sig_qual) / 4; for (i = 0; i < size && blk > dev->radio_rx_rds_last_block; dev->radio_rx_rds_last_block++) { unsigned data_blk = dev->radio_rx_rds_last_block % VIVID_RDS_GEN_BLOCKS; struct v4l2_rds_data rds = data[data_blk]; if (data_blk == 0 && dev->radio_rds_loop) vivid_radio_rds_init(dev); if (perc && get_random_u32_below(100) < perc) { switch (get_random_u32_below(4)) { case 0: rds.block |= V4L2_RDS_BLOCK_CORRECTED; break; case 1: rds.block |= V4L2_RDS_BLOCK_INVALID; break; case 2: rds.block |= V4L2_RDS_BLOCK_ERROR; rds.lsb = get_random_u8(); rds.msb = get_random_u8(); break; case 3: /* Skip block altogether */ if (i) continue; /* * Must make sure at least one block is * returned, otherwise the application * might think that end-of-file occurred. */ break; } } if (copy_to_user(buf + i, &rds, sizeof(rds))) { i = -EFAULT; break; } i += sizeof(rds); } mutex_unlock(&dev->mutex); return i; } __poll_t vivid_radio_rx_poll(struct file *file, struct poll_table_struct *wait) { return EPOLLIN | EPOLLRDNORM | v4l2_ctrl_poll(file, wait); } int vivid_radio_rx_enum_freq_bands(struct file *file, void *priv, struct v4l2_frequency_band *band) { if (band->tuner != 0) return -EINVAL; if (band->index >= TOT_BANDS) return -EINVAL; *band = vivid_radio_bands[band->index]; return 0; } int vivid_radio_rx_s_hw_freq_seek(struct file *file, void *priv, const struct v4l2_hw_freq_seek *a) { struct vivid_dev *dev = video_drvdata(file); unsigned low, high; unsigned freq; unsigned spacing; unsigned band; if (a->tuner) return -EINVAL; if (a->wrap_around && dev->radio_rx_hw_seek_mode == VIVID_HW_SEEK_BOUNDED) return -EINVAL; if (!a->wrap_around && dev->radio_rx_hw_seek_mode == VIVID_HW_SEEK_WRAP) return -EINVAL; if (!a->rangelow ^ !a->rangehigh) return -EINVAL; if (file->f_flags & O_NONBLOCK) return -EWOULDBLOCK; if (a->rangelow) { for (band = 0; band < TOT_BANDS; band++) if (a->rangelow >= vivid_radio_bands[band].rangelow && a->rangehigh <= vivid_radio_bands[band].rangehigh) break; if (band == TOT_BANDS) return -EINVAL; if (!dev->radio_rx_hw_seek_prog_lim && (a->rangelow != vivid_radio_bands[band].rangelow || a->rangehigh != vivid_radio_bands[band].rangehigh)) return -EINVAL; low = a->rangelow; high = a->rangehigh; } else { for (band = 0; band < TOT_BANDS; band++) if (dev->radio_rx_freq >= vivid_radio_bands[band].rangelow && dev->radio_rx_freq <= vivid_radio_bands[band].rangehigh) break; if (band == TOT_BANDS) return -EINVAL; low = vivid_radio_bands[band].rangelow; high = vivid_radio_bands[band].rangehigh; } spacing = band == BAND_AM ? 
1600 : 16000; freq = clamp(dev->radio_rx_freq, low, high); if (a->seek_upward) { freq = spacing * (freq / spacing) + spacing; if (freq > high) { if (!a->wrap_around) return -ENODATA; freq = spacing * (low / spacing) + spacing; if (freq >= dev->radio_rx_freq) return -ENODATA; } } else { freq = spacing * ((freq + spacing - 1) / spacing) - spacing; if (freq < low) { if (!a->wrap_around) return -ENODATA; freq = spacing * ((high + spacing - 1) / spacing) - spacing; if (freq <= dev->radio_rx_freq) return -ENODATA; } } return 0; } int vivid_radio_rx_g_tuner(struct file *file, void *priv, struct v4l2_tuner *vt) { struct vivid_dev *dev = video_drvdata(file); int delta = 800; int sig_qual; if (vt->index > 0) return -EINVAL; strscpy(vt->name, "AM/FM/SW Receiver", sizeof(vt->name)); vt->capability = V4L2_TUNER_CAP_LOW | V4L2_TUNER_CAP_STEREO | V4L2_TUNER_CAP_FREQ_BANDS | V4L2_TUNER_CAP_RDS | (dev->radio_rx_rds_controls ? V4L2_TUNER_CAP_RDS_CONTROLS : V4L2_TUNER_CAP_RDS_BLOCK_IO) | (dev->radio_rx_hw_seek_prog_lim ? V4L2_TUNER_CAP_HWSEEK_PROG_LIM : 0); switch (dev->radio_rx_hw_seek_mode) { case VIVID_HW_SEEK_BOUNDED: vt->capability |= V4L2_TUNER_CAP_HWSEEK_BOUNDED; break; case VIVID_HW_SEEK_WRAP: vt->capability |= V4L2_TUNER_CAP_HWSEEK_WRAP; break; case VIVID_HW_SEEK_BOTH: vt->capability |= V4L2_TUNER_CAP_HWSEEK_WRAP | V4L2_TUNER_CAP_HWSEEK_BOUNDED; break; } vt->rangelow = AM_FREQ_RANGE_LOW; vt->rangehigh = FM_FREQ_RANGE_HIGH; sig_qual = dev->radio_rx_sig_qual; vt->signal = abs(sig_qual) > delta ? 0 : 0xffff - ((unsigned)abs(sig_qual) * 0xffff) / delta; vt->afc = sig_qual > delta ? 0 : sig_qual; if (abs(sig_qual) > delta) vt->rxsubchans = 0; else if (dev->radio_rx_freq < FM_FREQ_RANGE_LOW || vt->signal < 0x8000) vt->rxsubchans = V4L2_TUNER_SUB_MONO; else if (dev->radio_rds_loop && !(dev->radio_tx_subchans & V4L2_TUNER_SUB_STEREO)) vt->rxsubchans = V4L2_TUNER_SUB_MONO; else vt->rxsubchans = V4L2_TUNER_SUB_STEREO; if (dev->radio_rx_rds_enabled && (!dev->radio_rds_loop || (dev->radio_tx_subchans & V4L2_TUNER_SUB_RDS)) && dev->radio_rx_freq >= FM_FREQ_RANGE_LOW && vt->signal >= 0xc000) vt->rxsubchans |= V4L2_TUNER_SUB_RDS; if (dev->radio_rx_rds_controls) vivid_radio_rds_init(dev); vt->audmode = dev->radio_rx_audmode; return 0; } int vivid_radio_rx_s_tuner(struct file *file, void *priv, const struct v4l2_tuner *vt) { struct vivid_dev *dev = video_drvdata(file); if (vt->index) return -EINVAL; dev->radio_rx_audmode = vt->audmode >= V4L2_TUNER_MODE_STEREO; return 0; } |
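A quick illustration of the hardware-seek arithmetic above (editorial sketch, not driver code): seeking snaps strictly to the next or previous multiple of the channel spacing, expressed in the 62.5 Hz units implied by V4L2_TUNER_CAP_LOW (16000 units = 1 MHz for FM, 1600 units = 100 kHz for AM):

#include <stdio.h>

static unsigned seek_up(unsigned freq, unsigned spacing)
{
	/* next grid point strictly above freq, even if freq is on the grid */
	return spacing * (freq / spacing) + spacing;
}

static unsigned seek_down(unsigned freq, unsigned spacing)
{
	/* previous grid point strictly below freq */
	return spacing * ((freq + spacing - 1) / spacing) - spacing;
}

int main(void)
{
	unsigned spacing = 16000;	/* FM: 1 MHz steps; AM would use 1600 */

	printf("up from 90.3 MHz   -> %u\n", seek_up(90 * 16000 + 4800, spacing));   /* 91.0 MHz */
	printf("up from 90.0 MHz   -> %u\n", seek_up(90 * 16000, spacing));          /* 91.0 MHz */
	printf("down from 90.3 MHz -> %u\n", seek_down(90 * 16000 + 4800, spacing)); /* 90.0 MHz */
	printf("down from 90.0 MHz -> %u\n", seek_down(90 * 16000, spacing));        /* 89.0 MHz */
	return 0;
}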
| 4 1 1 8 7 6 5 5 8 5 5 4 5 1 2 2 1 1 2 2 2 1 1 1 2 2 2 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 | // SPDX-License-Identifier: GPL-2.0 /* * queue_stack_maps.c: BPF queue and stack maps * * Copyright (c) 2018 Politecnico di Torino */ #include <linux/bpf.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/btf_ids.h> #include "percpu_freelist.h" #include <asm/rqspinlock.h> #define QUEUE_STACK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_queue_stack { struct bpf_map map; rqspinlock_t lock; u32 head, tail; u32 size; /* max_entries + 1 */ char elements[] __aligned(8); }; static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map) { return container_of(map, struct bpf_queue_stack, map); } static bool queue_stack_map_is_empty(struct bpf_queue_stack *qs) { return qs->head == qs->tail; } static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) { u32 head = qs->head + 1; if (unlikely(head >= qs->size)) head = 0; return head == qs->tail; } /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || attr->value_size == 0 || attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK || !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. 
*/ return -E2BIG; return 0; } static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_queue_stack *qs; u64 size, queue_size; size = (u64) attr->max_entries + 1; queue_size = sizeof(*qs) + size * attr->value_size; qs = bpf_map_area_alloc(queue_size, numa_node); if (!qs) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&qs->map, attr); qs->size = size; raw_res_spin_lock_init(&qs->lock); return &qs->map; } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void queue_stack_map_free(struct bpf_map *map) { struct bpf_queue_stack *qs = bpf_queue_stack(map); bpf_map_area_free(qs); } static long __queue_map_get(struct bpf_map *map, void *value, bool delete) { struct bpf_queue_stack *qs = bpf_queue_stack(map); unsigned long flags; int err = 0; void *ptr; if (raw_res_spin_lock_irqsave(&qs->lock, flags)) return -EBUSY; if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); err = -ENOENT; goto out; } ptr = &qs->elements[qs->tail * qs->map.value_size]; memcpy(value, ptr, qs->map.value_size); if (delete) { if (unlikely(++qs->tail >= qs->size)) qs->tail = 0; } out: raw_res_spin_unlock_irqrestore(&qs->lock, flags); return err; } static long __stack_map_get(struct bpf_map *map, void *value, bool delete) { struct bpf_queue_stack *qs = bpf_queue_stack(map); unsigned long flags; int err = 0; void *ptr; u32 index; if (raw_res_spin_lock_irqsave(&qs->lock, flags)) return -EBUSY; if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); err = -ENOENT; goto out; } index = qs->head - 1; if (unlikely(index >= qs->size)) index = qs->size - 1; ptr = &qs->elements[index * qs->map.value_size]; memcpy(value, ptr, qs->map.value_size); if (delete) qs->head = index; out: raw_res_spin_unlock_irqrestore(&qs->lock, flags); return err; } /* Called from syscall or from eBPF program */ static long queue_map_peek_elem(struct bpf_map *map, void *value) { return __queue_map_get(map, value, false); } /* Called from syscall or from eBPF program */ static long stack_map_peek_elem(struct bpf_map *map, void *value) { return __stack_map_get(map, value, false); } /* Called from syscall or from eBPF program */ static long queue_map_pop_elem(struct bpf_map *map, void *value) { return __queue_map_get(map, value, true); } /* Called from syscall or from eBPF program */ static long stack_map_pop_elem(struct bpf_map *map, void *value) { return __stack_map_get(map, value, true); } /* Called from syscall or from eBPF program */ static long queue_stack_map_push_elem(struct bpf_map *map, void *value, u64 flags) { struct bpf_queue_stack *qs = bpf_queue_stack(map); unsigned long irq_flags; int err = 0; void *dst; /* BPF_EXIST is used to force making room for a new element in case the * map is full */ bool replace = (flags & BPF_EXIST); /* Check supported flags for queue and stack maps */ if (flags & BPF_NOEXIST || flags > BPF_EXIST) return -EINVAL; if (raw_res_spin_lock_irqsave(&qs->lock, irq_flags)) return -EBUSY; if (queue_stack_map_is_full(qs)) { if (!replace) { err = -E2BIG; goto out; } /* advance tail pointer to overwrite oldest element */ if (unlikely(++qs->tail >= qs->size)) qs->tail = 0; } dst = &qs->elements[qs->head * qs->map.value_size]; memcpy(dst, value, qs->map.value_size); if (unlikely(++qs->head >= qs->size)) qs->head = 0; out: raw_res_spin_unlock_irqrestore(&qs->lock, irq_flags); return err; } /* Called from syscall or from eBPF program */ static void *queue_stack_map_lookup_elem(struct bpf_map 
*map, void *key) { return NULL; } /* Called from syscall or from eBPF program */ static long queue_stack_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { return -EINVAL; } /* Called from syscall or from eBPF program */ static long queue_stack_map_delete_elem(struct bpf_map *map, void *key) { return -EINVAL; } /* Called from syscall */ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -EINVAL; } static u64 queue_stack_map_mem_usage(const struct bpf_map *map) { u64 usage = sizeof(struct bpf_queue_stack); usage += ((u64)map->max_entries + 1) * map->value_size; return usage; } BTF_ID_LIST_SINGLE(queue_map_btf_ids, struct, bpf_queue_stack) const struct bpf_map_ops queue_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, .map_free = queue_stack_map_free, .map_lookup_elem = queue_stack_map_lookup_elem, .map_update_elem = queue_stack_map_update_elem, .map_delete_elem = queue_stack_map_delete_elem, .map_push_elem = queue_stack_map_push_elem, .map_pop_elem = queue_map_pop_elem, .map_peek_elem = queue_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, .map_mem_usage = queue_stack_map_mem_usage, .map_btf_id = &queue_map_btf_ids[0], }; const struct bpf_map_ops stack_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, .map_free = queue_stack_map_free, .map_lookup_elem = queue_stack_map_lookup_elem, .map_update_elem = queue_stack_map_update_elem, .map_delete_elem = queue_stack_map_delete_elem, .map_push_elem = queue_stack_map_push_elem, .map_pop_elem = stack_map_pop_elem, .map_peek_elem = stack_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, .map_mem_usage = queue_stack_map_mem_usage, .map_btf_id = &queue_map_btf_ids[0], }; |
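The head/tail handling above is the classic one-slot-sacrifice ring buffer: the element array holds max_entries + 1 slots so that head == tail can only ever mean "empty" and never "full". A minimal user-space model of the same index scheme (all names local to the sketch):

#include <stdbool.h>
#include <stdio.h>

#define MAX_ENTRIES 3
#define SIZE (MAX_ENTRIES + 1)	/* one slot is sacrificed to disambiguate full/empty */

static int elements[SIZE];
static unsigned int head, tail;

static bool is_empty(void) { return head == tail; }

static bool is_full(void)
{
	unsigned int next = head + 1;

	if (next >= SIZE)
		next = 0;
	return next == tail;
}

static bool push(int v)
{
	if (is_full())
		return false;	/* a caller could instead overwrite the oldest, as BPF_EXIST does */
	elements[head] = v;
	if (++head >= SIZE)
		head = 0;
	return true;
}

static bool pop(int *v)
{
	if (is_empty())
		return false;
	*v = elements[tail];
	if (++tail >= SIZE)
		tail = 0;
	return true;
}

int main(void)
{
	int v;

	push(1); push(2); push(3);
	printf("full: %d\n", is_full());	/* 1: only MAX_ENTRIES elements fit */
	while (pop(&v))
		printf("%d ", v);		/* FIFO order: 1 2 3 */
	printf("\n");
	return 0;
}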
| 26 26 5047 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SOCKET_H #define _LINUX_SOCKET_H #include <asm/socket.h> /* arch-dependent defines */ #include <linux/sockios.h> /* the SIOCxxx I/O controls */ #include <linux/uio.h> /* iovec support */ #include <linux/types.h> /* pid_t */ #include <linux/compiler.h> /* __user */ #include <uapi/linux/socket.h> struct file; struct pid; struct cred; struct socket; struct sock; struct sk_buff; struct proto_accept_arg; #define __sockaddr_check_size(size) \ BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage))) #ifdef CONFIG_PROC_FS struct seq_file; extern void socket_seq_show(struct seq_file *seq); #endif typedef __kernel_sa_family_t sa_family_t; /* * 1003.1g requires sa_family_t and that sa_data is char. */ struct sockaddr { sa_family_t sa_family; /* address family, AF_xxx */ union { char sa_data_min[14]; /* Minimum 14 bytes of protocol address */ DECLARE_FLEX_ARRAY(char, sa_data); }; }; struct linger { int l_onoff; /* Linger active */ int l_linger; /* How long to linger for */ }; #define sockaddr_storage __kernel_sockaddr_storage /* * As we do 4.4BSD message passing we use a 4.4BSD message passing * system, not 4.3. Thus msg_accrights(len) are now missing. They * belong in an obscure libc emulation or the bin. */ struct msghdr { void *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ int msg_inq; /* output, data left in socket */ struct iov_iter msg_iter; /* data */ /* * Ancillary data. msg_control_user is the user buffer used for the * recv* side when msg_control_is_user is set, msg_control is the kernel * buffer used for all other cases. 
*/ union { void *msg_control; void __user *msg_control_user; }; bool msg_control_is_user : 1; bool msg_get_inq : 1;/* return INQ after receive */ unsigned int msg_flags; /* flags on received message */ __kernel_size_t msg_controllen; /* ancillary data buffer length */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ struct ubuf_info *msg_ubuf; int (*sg_from_iter)(struct sk_buff *skb, struct iov_iter *from, size_t length); }; struct user_msghdr { void __user *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ struct iovec __user *msg_iov; /* scatter/gather array */ __kernel_size_t msg_iovlen; /* # elements in msg_iov */ void __user *msg_control; /* ancillary data */ __kernel_size_t msg_controllen; /* ancillary data buffer length */ unsigned int msg_flags; /* flags on received message */ }; /* For recvmmsg/sendmmsg */ struct mmsghdr { struct user_msghdr msg_hdr; unsigned int msg_len; }; /* * POSIX 1003.1g - ancillary data object information * Ancillary data consists of a sequence of pairs of * (cmsghdr, cmsg_data[]) */ struct cmsghdr { __kernel_size_t cmsg_len; /* data byte count, including hdr */ int cmsg_level; /* originating protocol */ int cmsg_type; /* protocol-specific type */ }; /* * Ancillary data object information MACROS * Table 5-14 of POSIX 1003.1g */ #define __CMSG_NXTHDR(ctl, len, cmsg) __cmsg_nxthdr((ctl),(len),(cmsg)) #define CMSG_NXTHDR(mhdr, cmsg) cmsg_nxthdr((mhdr), (cmsg)) #define CMSG_ALIGN(len) ( ((len)+sizeof(long)-1) & ~(sizeof(long)-1) ) #define CMSG_DATA(cmsg) \ ((void *)(cmsg) + sizeof(struct cmsghdr)) #define CMSG_USER_DATA(cmsg) \ ((void __user *)(cmsg) + sizeof(struct cmsghdr)) #define CMSG_SPACE(len) (sizeof(struct cmsghdr) + CMSG_ALIGN(len)) #define CMSG_LEN(len) (sizeof(struct cmsghdr) + (len)) #define __CMSG_FIRSTHDR(ctl,len) ((len) >= sizeof(struct cmsghdr) ? \ (struct cmsghdr *)(ctl) : \ (struct cmsghdr *)NULL) #define CMSG_FIRSTHDR(msg) __CMSG_FIRSTHDR((msg)->msg_control, (msg)->msg_controllen) #define CMSG_OK(mhdr, cmsg) ((cmsg)->cmsg_len >= sizeof(struct cmsghdr) && \ (cmsg)->cmsg_len <= (unsigned long) \ ((mhdr)->msg_controllen - \ ((char *)(cmsg) - (char *)(mhdr)->msg_control))) #define for_each_cmsghdr(cmsg, msg) \ for (cmsg = CMSG_FIRSTHDR(msg); \ cmsg; \ cmsg = CMSG_NXTHDR(msg, cmsg)) /* * Get the next cmsg header * * PLEASE, do not touch this function. If you think, that it is * incorrect, grep kernel sources and think about consequences * before trying to improve it. * * Now it always returns valid, not truncated ancillary object * HEADER. But caller still MUST check, that cmsg->cmsg_len is * inside range, given by msg->msg_controllen before using * ancillary object DATA. 
--ANK (980731) */ static inline struct cmsghdr * __cmsg_nxthdr(void *__ctl, __kernel_size_t __size, struct cmsghdr *__cmsg) { struct cmsghdr * __ptr; __ptr = (struct cmsghdr*)(((unsigned char *) __cmsg) + CMSG_ALIGN(__cmsg->cmsg_len)); if ((unsigned long)((char*)(__ptr+1) - (char *) __ctl) > __size) return (struct cmsghdr *)0; return __ptr; } static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr *__cmsg) { return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg); } static inline size_t msg_data_left(const struct msghdr *msg) { return iov_iter_count(&msg->msg_iter); } /* "Socket"-level control message types: */ #define SCM_RIGHTS 0x01 /* rw: access rights (array of int) */ #define SCM_CREDENTIALS 0x02 /* rw: struct ucred */ #define SCM_SECURITY 0x03 /* rw: security label */ #define SCM_PIDFD 0x04 /* ro: pidfd (int) */ struct ucred { __u32 pid; __u32 uid; __u32 gid; }; /* Supported address families. */ #define AF_UNSPEC 0 #define AF_UNIX 1 /* Unix domain sockets */ #define AF_LOCAL 1 /* POSIX name for AF_UNIX */ #define AF_INET 2 /* Internet IP Protocol */ #define AF_AX25 3 /* Amateur Radio AX.25 */ #define AF_IPX 4 /* Novell IPX */ #define AF_APPLETALK 5 /* AppleTalk DDP */ #define AF_NETROM 6 /* Amateur Radio NET/ROM */ #define AF_BRIDGE 7 /* Multiprotocol bridge */ #define AF_ATMPVC 8 /* ATM PVCs */ #define AF_X25 9 /* Reserved for X.25 project */ #define AF_INET6 10 /* IP version 6 */ #define AF_ROSE 11 /* Amateur Radio X.25 PLP */ #define AF_DECnet 12 /* Reserved for DECnet project */ #define AF_NETBEUI 13 /* Reserved for 802.2LLC project*/ #define AF_SECURITY 14 /* Security callback pseudo AF */ #define AF_KEY 15 /* PF_KEY key management API */ #define AF_NETLINK 16 #define AF_ROUTE AF_NETLINK /* Alias to emulate 4.4BSD */ #define AF_PACKET 17 /* Packet family */ #define AF_ASH 18 /* Ash */ #define AF_ECONET 19 /* Acorn Econet */ #define AF_ATMSVC 20 /* ATM SVCs */ #define AF_RDS 21 /* RDS sockets */ #define AF_SNA 22 /* Linux SNA Project (nutters!) */ #define AF_IRDA 23 /* IRDA sockets */ #define AF_PPPOX 24 /* PPPoX sockets */ #define AF_WANPIPE 25 /* Wanpipe API Sockets */ #define AF_LLC 26 /* Linux LLC */ #define AF_IB 27 /* Native InfiniBand address */ #define AF_MPLS 28 /* MPLS */ #define AF_CAN 29 /* Controller Area Network */ #define AF_TIPC 30 /* TIPC sockets */ #define AF_BLUETOOTH 31 /* Bluetooth sockets */ #define AF_IUCV 32 /* IUCV sockets */ #define AF_RXRPC 33 /* RxRPC sockets */ #define AF_ISDN 34 /* mISDN sockets */ #define AF_PHONET 35 /* Phonet sockets */ #define AF_IEEE802154 36 /* IEEE802154 sockets */ #define AF_CAIF 37 /* CAIF sockets */ #define AF_ALG 38 /* Algorithm sockets */ #define AF_NFC 39 /* NFC sockets */ #define AF_VSOCK 40 /* vSockets */ #define AF_KCM 41 /* Kernel Connection Multiplexor*/ #define AF_QIPCRTR 42 /* Qualcomm IPC Router */ #define AF_SMC 43 /* smc sockets: reserve number for * PF_SMC protocol family that * reuses AF_INET address family */ #define AF_XDP 44 /* XDP sockets */ #define AF_MCTP 45 /* Management component * transport protocol */ #define AF_MAX 46 /* For now.. */ /* Protocol families, same as address families. 
*/ #define PF_UNSPEC AF_UNSPEC #define PF_UNIX AF_UNIX #define PF_LOCAL AF_LOCAL #define PF_INET AF_INET #define PF_AX25 AF_AX25 #define PF_IPX AF_IPX #define PF_APPLETALK AF_APPLETALK #define PF_NETROM AF_NETROM #define PF_BRIDGE AF_BRIDGE #define PF_ATMPVC AF_ATMPVC #define PF_X25 AF_X25 #define PF_INET6 AF_INET6 #define PF_ROSE AF_ROSE #define PF_DECnet AF_DECnet #define PF_NETBEUI AF_NETBEUI #define PF_SECURITY AF_SECURITY #define PF_KEY AF_KEY #define PF_NETLINK AF_NETLINK #define PF_ROUTE AF_ROUTE #define PF_PACKET AF_PACKET #define PF_ASH AF_ASH #define PF_ECONET AF_ECONET #define PF_ATMSVC AF_ATMSVC #define PF_RDS AF_RDS #define PF_SNA AF_SNA #define PF_IRDA AF_IRDA #define PF_PPPOX AF_PPPOX #define PF_WANPIPE AF_WANPIPE #define PF_LLC AF_LLC #define PF_IB AF_IB #define PF_MPLS AF_MPLS #define PF_CAN AF_CAN #define PF_TIPC AF_TIPC #define PF_BLUETOOTH AF_BLUETOOTH #define PF_IUCV AF_IUCV #define PF_RXRPC AF_RXRPC #define PF_ISDN AF_ISDN #define PF_PHONET AF_PHONET #define PF_IEEE802154 AF_IEEE802154 #define PF_CAIF AF_CAIF #define PF_ALG AF_ALG #define PF_NFC AF_NFC #define PF_VSOCK AF_VSOCK #define PF_KCM AF_KCM #define PF_QIPCRTR AF_QIPCRTR #define PF_SMC AF_SMC #define PF_XDP AF_XDP #define PF_MCTP AF_MCTP #define PF_MAX AF_MAX /* Maximum queue length specifiable by listen. */ #define SOMAXCONN 4096 /* Flags we can use with send/ and recv. Added those for 1003.1g not all are supported yet */ #define MSG_OOB 1 #define MSG_PEEK 2 #define MSG_DONTROUTE 4 #define MSG_TRYHARD 4 /* Synonym for MSG_DONTROUTE for DECnet */ #define MSG_CTRUNC 8 #define MSG_PROBE 0x10 /* Do not send. Only probe path f.e. for MTU */ #define MSG_TRUNC 0x20 #define MSG_DONTWAIT 0x40 /* Nonblocking io */ #define MSG_EOR 0x80 /* End of record */ #define MSG_WAITALL 0x100 /* Wait for a full request */ #define MSG_FIN 0x200 #define MSG_SYN 0x400 #define MSG_CONFIRM 0x800 /* Confirm path validity */ #define MSG_RST 0x1000 #define MSG_ERRQUEUE 0x2000 /* Fetch message from error queue */ #define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */ #define MSG_MORE 0x8000 /* Sender will send more */ #define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */ #define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */ #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN #define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */ #define MSG_SENDPAGE_DECRYPTED 0x100000 /* sendpage() internal : page may carry * plain text and require encryption */ #define MSG_SOCK_DEVMEM 0x2000000 /* Receive devmem skbs as cmsg */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file descriptor received through SCM_RIGHTS */ #if defined(CONFIG_COMPAT) #define MSG_CMSG_COMPAT 0x80000000 /* This message needs 32 bit fixups */ #else #define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */ #endif /* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */ #define MSG_INTERNAL_SENDMSG_FLAGS \ (MSG_SPLICE_PAGES | MSG_SENDPAGE_NOPOLICY | MSG_SENDPAGE_DECRYPTED) /* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */ #define SOL_IP 0 /* #define SOL_ICMP 1 No-no-no! 
Due to Linux :-) we cannot use SOL_ICMP=1 */ #define SOL_TCP 6 #define SOL_UDP 17 #define SOL_IPV6 41 #define SOL_ICMPV6 58 #define SOL_SCTP 132 #define SOL_UDPLITE 136 /* UDP-Lite (RFC 3828) */ #define SOL_RAW 255 #define SOL_IPX 256 #define SOL_AX25 257 #define SOL_ATALK 258 #define SOL_NETROM 259 #define SOL_ROSE 260 #define SOL_DECNET 261 #define SOL_X25 262 #define SOL_PACKET 263 #define SOL_ATM 264 /* ATM layer (cell level) */ #define SOL_AAL 265 /* ATM Adaption Layer (packet level) */ #define SOL_IRDA 266 #define SOL_NETBEUI 267 #define SOL_LLC 268 #define SOL_DCCP 269 #define SOL_NETLINK 270 #define SOL_TIPC 271 #define SOL_RXRPC 272 #define SOL_PPPOL2TP 273 #define SOL_BLUETOOTH 274 #define SOL_PNPIPE 275 #define SOL_RDS 276 #define SOL_IUCV 277 #define SOL_CAIF 278 #define SOL_ALG 279 #define SOL_NFC 280 #define SOL_KCM 281 #define SOL_TLS 282 #define SOL_XDP 283 #define SOL_MPTCP 284 #define SOL_MCTP 285 #define SOL_SMC 286 #define SOL_VSOCK 287 /* IPX options */ #define IPX_TYPE 1 extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); extern int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len, void *data); struct timespec64; struct __kernel_timespec; struct old_timespec32; struct scm_timestamping_internal { struct timespec64 ts[3]; }; extern void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss); extern void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_internal *tss); /* The __sys_...msg variants allow MSG_CMSG_COMPAT iff * forbid_cmsg_compat==false */ extern long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat); extern long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat); extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct __kernel_timespec __user *timeout, struct old_timespec32 __user *timeout32); extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, bool forbid_cmsg_compat); extern long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg, unsigned int flags); extern long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, struct user_msghdr __user *umsg, struct sockaddr __user *uaddr, unsigned int flags); extern int __copy_msghdr(struct msghdr *kmsg, struct user_msghdr *umsg, struct sockaddr __user **save_addr); /* helpers which do the actual work for syscalls */ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, struct sockaddr __user *addr, int __user *addr_len); extern int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr __user *addr, int addr_len); extern struct file *do_accept(struct file *file, struct proto_accept_arg *arg, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_socket(int family, int type, int protocol); extern struct file *__sys_socket_file(int family, int type, int protocol); extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); extern int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address, int addrlen); extern int __sys_connect_file(struct file *file, struct 
sockaddr_storage *addr, int addrlen, int file_flags); extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen); extern int __sys_listen(int fd, int backlog); extern int __sys_listen_socket(struct socket *sock, int backlog); extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_socketpair(int family, int type, int protocol, int __user *usockvec); extern int __sys_shutdown_sock(struct socket *sock, int how); extern int __sys_shutdown(int fd, int how); #endif /* _LINUX_SOCKET_H */ |
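As a usage illustration of the CMSG_* layout declared above (a user-space sketch built on the libc macros, not kernel code): passing a file descriptor across an AF_UNIX socket as SCM_RIGHTS ancillary data, sizing the control buffer with CMSG_SPACE()/CMSG_LEN() and walking it on receive the same way for_each_cmsghdr() does:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static int send_fd(int sock, int fd)
{
	char iobuf[1] = { 0 };	/* send one real byte alongside the cmsg */
	struct iovec iov = { .iov_base = iobuf, .iov_len = sizeof(iobuf) };
	union {			/* union keeps the control buffer suitably aligned */
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

	return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
}

static int recv_fd(int sock)
{
	char iobuf[1];
	struct iovec iov = { .iov_base = iobuf, .iov_len = sizeof(iobuf) };
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg;
	int fd = -1;

	if (recvmsg(sock, &msg, 0) < 0)
		return -1;

	/* Walk the ancillary data: CMSG_FIRSTHDR, then CMSG_NXTHDR until NULL. */
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
		if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS)
			memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));

	return fd;
}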
| 400 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM fib6 #if !defined(_TRACE_FIB6_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FIB6_H #include <linux/in6.h> #include <net/flow.h> #include <net/ip6_fib.h> #include <linux/tracepoint.h> TRACE_EVENT(fib6_table_lookup, TP_PROTO(const struct net *net, const struct fib6_result *res, struct fib6_table *table, const struct flowi6 *flp), TP_ARGS(net, res, table, flp), TP_STRUCT__entry( __field( u32, tb_id ) __field( int, err ) __field( int, oif ) __field( int, iif ) __field( u32, flowlabel ) __field( __u8, tos ) __field( __u8, scope ) __field( __u8, flags ) __array( __u8, src, 16 ) __array( __u8, dst, 16 ) __field( u16, sport ) __field( u16, dport ) __field( u8, proto ) __field( u8, rt_type ) __array( char, name, IFNAMSIZ ) __array( __u8, gw, 16 ) ), TP_fast_assign( struct in6_addr *in6; __entry->tb_id = table->tb6_id; __entry->err = ip6_rt_type_to_error(res->fib6_type); __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; __entry->flowlabel = ntohl(flowi6_get_flowlabel(flp)); __entry->tos = ip6_tclass(flp->flowlabel); __entry->scope = flp->flowi6_scope; __entry->flags = flp->flowi6_flags; in6 = (struct in6_addr *)__entry->src; *in6 = flp->saddr; in6 = (struct in6_addr *)__entry->dst; *in6 = flp->daddr; __entry->proto = flp->flowi6_proto; if (__entry->proto == IPPROTO_TCP || __entry->proto == IPPROTO_UDP) { __entry->sport = ntohs(flp->fl6_sport); __entry->dport = ntohs(flp->fl6_dport); } else { __entry->sport = 0; __entry->dport = 0; } if (res->nh && res->nh->fib_nh_dev) { strscpy(__entry->name, res->nh->fib_nh_dev->name, IFNAMSIZ); } else { strcpy(__entry->name, "-"); } if (res->f6i == net->ipv6.fib6_null_entry) { in6 = (struct in6_addr *)__entry->gw; *in6 = in6addr_any; } else if (res->nh) { in6 = (struct in6_addr *)__entry->gw; *in6 = res->nh->fib_nh_gw6; } ), TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u flowlabel %#x tos %d scope %d flags %x ==> dev %s gw %pI6c err %d", __entry->tb_id, __entry->oif, __entry->iif, __entry->proto, __entry->src, __entry->sport, __entry->dst, __entry->dport, __entry->flowlabel, __entry->tos, __entry->scope, __entry->flags, __entry->name, __entry->gw, __entry->err) ); #endif /* _TRACE_FIB6_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
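For context on the tos and flowlabel fields recorded by the tracepoint above (an editorial sketch in host byte order for simplicity; the kernel works on __be32 flow information), the first 32 bits of an IPv6 header split into a 4-bit version, an 8-bit traffic class, and a 20-bit flow label:

#include <stdint.h>
#include <stdio.h>

static void split_v6_flow_word(uint32_t word)
{
	unsigned int version = word >> 28;		/* always 6 for IPv6 */
	unsigned int tclass  = (word >> 20) & 0xff;	/* the "tos" the tracepoint prints */
	unsigned int flowlbl = word & 0xfffff;		/* the 20-bit flow label */

	printf("version %u tclass %#x flowlabel %#x\n", version, tclass, flowlbl);
}

int main(void)
{
	split_v6_flow_word(0x6b812345);	/* version 6, tclass 0xb8, label 0x12345 */
	return 0;
}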
| 212 4 1 3 3 3 3 3 212 212 212 212 212 212 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 | /* * cgroup_freezer.c - control group freezer subsystem * * Copyright IBM Corporation, 2007 * * Author : Cedric Le Goater <clg@fr.ibm.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2.1 of the GNU Lesser General Public License * as published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ #include <linux/export.h> #include <linux/slab.h> #include <linux/cgroup.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/freezer.h> #include <linux/seq_file.h> #include <linux/mutex.h> #include <linux/cpu.h> /* * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of * its ancestors has FREEZING_SELF set. 
*/ enum freezer_state_flags { CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ /* mask for all FREEZING flags */ CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, }; struct freezer { struct cgroup_subsys_state css; unsigned int state; }; static DEFINE_MUTEX(freezer_mutex); static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) { return css ? container_of(css, struct freezer, css) : NULL; } static inline struct freezer *task_freezer(struct task_struct *task) { return css_freezer(task_css(task, freezer_cgrp_id)); } static struct freezer *parent_freezer(struct freezer *freezer) { return css_freezer(freezer->css.parent); } bool cgroup_freezing(struct task_struct *task) { bool ret; rcu_read_lock(); ret = task_freezer(task)->state & CGROUP_FREEZING; rcu_read_unlock(); return ret; } static const char *freezer_state_strs(unsigned int state) { if (state & CGROUP_FROZEN) return "FROZEN"; if (state & CGROUP_FREEZING) return "FREEZING"; return "THAWED"; }; static struct cgroup_subsys_state * freezer_css_alloc(struct cgroup_subsys_state *parent_css) { struct freezer *freezer; freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); if (!freezer) return ERR_PTR(-ENOMEM); return &freezer->css; } /** * freezer_css_online - commit creation of a freezer css * @css: css being created * * We're committing to creation of @css. Mark it online and inherit * parent's freezing state while holding cpus read lock and freezer_mutex. */ static int freezer_css_online(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); struct freezer *parent = parent_freezer(freezer); cpus_read_lock(); mutex_lock(&freezer_mutex); freezer->state |= CGROUP_FREEZER_ONLINE; if (parent && (parent->state & CGROUP_FREEZING)) { freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; static_branch_inc_cpuslocked(&freezer_active); } mutex_unlock(&freezer_mutex); cpus_read_unlock(); return 0; } /** * freezer_css_offline - initiate destruction of a freezer css * @css: css being destroyed * * @css is going away. Mark it dead and decrement freezer_active if * it was holding one. */ static void freezer_css_offline(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); cpus_read_lock(); mutex_lock(&freezer_mutex); if (freezer->state & CGROUP_FREEZING) static_branch_dec_cpuslocked(&freezer_active); freezer->state = 0; mutex_unlock(&freezer_mutex); cpus_read_unlock(); } static void freezer_css_free(struct cgroup_subsys_state *css) { kfree(css_freezer(css)); } /* * Tasks can be migrated into a different freezer anytime regardless of its * current state. freezer_attach() is responsible for making new tasks * conform to the current state. * * Freezer state changes and task migration are synchronized via * @freezer->lock. freezer_attach() makes the new tasks conform to the * current state and all following state changes can see the new tasks. */ static void freezer_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *new_css; mutex_lock(&freezer_mutex); /* * Make the new tasks conform to the current state of @new_css. * For simplicity, when migrating any task to a FROZEN cgroup, we * revert it to FREEZING and let update_if_frozen() determine the * correct state later. 
* * Tasks in @tset are on @new_css but may not conform to its * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ cgroup_taskset_for_each(task, new_css, tset) { struct freezer *freezer = css_freezer(new_css); if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { /* clear FROZEN and propagate upwards */ while (freezer && (freezer->state & CGROUP_FROZEN)) { freezer->state &= ~CGROUP_FROZEN; freezer = parent_freezer(freezer); } freeze_task(task); } } mutex_unlock(&freezer_mutex); } /** * freezer_fork - cgroup post fork callback * @task: a task which has just been forked * * @task has just been created and should conform to the current state of * the cgroup_freezer it belongs to. This function may race against * freezer_attach(). Losing to freezer_attach() means that we don't have * to do anything as freezer_attach() will put @task into the appropriate * state. */ static void freezer_fork(struct task_struct *task) { struct freezer *freezer; /* * The root cgroup is non-freezable, so we can skip locking the * freezer. This is safe regardless of race with task migration. * If we didn't race or won, skipping is obviously the right thing * to do. If we lost and root is the new cgroup, noop is still the * right thing to do. */ if (task_css_is_root(task, freezer_cgrp_id)) return; mutex_lock(&freezer_mutex); rcu_read_lock(); freezer = task_freezer(task); if (freezer->state & CGROUP_FREEZING) freeze_task(task); rcu_read_unlock(); mutex_unlock(&freezer_mutex); } /** * update_if_frozen - update whether a cgroup finished freezing * @css: css of interest * * Once FREEZING is initiated, transition to FROZEN is lazily updated by * calling this function. If the current state is FREEZING but not FROZEN, * this function checks whether all tasks of this cgroup and the descendant * cgroups finished freezing and, if so, sets FROZEN. * * The caller is responsible for grabbing RCU read lock and calling * update_if_frozen() on all descendants prior to invoking this function. * * Task states and freezer state might disagree while tasks are being * migrated into or out of @css, so we can't verify task states against * @freezer state here. See freezer_attach() for details. */ static void update_if_frozen(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); struct cgroup_subsys_state *pos; struct css_task_iter it; struct task_struct *task; lockdep_assert_held(&freezer_mutex); if (!(freezer->state & CGROUP_FREEZING) || (freezer->state & CGROUP_FROZEN)) return; /* are all (live) children frozen? */ rcu_read_lock(); css_for_each_child(pos, css) { struct freezer *child = css_freezer(pos); if ((child->state & CGROUP_FREEZER_ONLINE) && !(child->state & CGROUP_FROZEN)) { rcu_read_unlock(); return; } } rcu_read_unlock(); /* are all tasks frozen? 
*/ css_task_iter_start(css, 0, &it); while ((task = css_task_iter_next(&it))) { if (freezing(task) && !frozen(task)) goto out_iter_end; } freezer->state |= CGROUP_FROZEN; out_iter_end: css_task_iter_end(&it); } static int freezer_read(struct seq_file *m, void *v) { struct cgroup_subsys_state *css = seq_css(m), *pos; mutex_lock(&freezer_mutex); rcu_read_lock(); /* update states bottom-up */ css_for_each_descendant_post(pos, css) { if (!css_tryget_online(pos)) continue; rcu_read_unlock(); update_if_frozen(pos); rcu_read_lock(); css_put(pos); } rcu_read_unlock(); mutex_unlock(&freezer_mutex); seq_puts(m, freezer_state_strs(css_freezer(css)->state)); seq_putc(m, '\n'); return 0; } static void freeze_cgroup(struct freezer *freezer) { struct css_task_iter it; struct task_struct *task; css_task_iter_start(&freezer->css, 0, &it); while ((task = css_task_iter_next(&it))) freeze_task(task); css_task_iter_end(&it); } static void unfreeze_cgroup(struct freezer *freezer) { struct css_task_iter it; struct task_struct *task; css_task_iter_start(&freezer->css, 0, &it); while ((task = css_task_iter_next(&it))) __thaw_task(task); css_task_iter_end(&it); } /** * freezer_apply_state - apply state change to a single cgroup_freezer * @freezer: freezer to apply state change to * @freeze: whether to freeze or unfreeze * @state: CGROUP_FREEZING_* flag to set or clear * * Set or clear @state on @cgroup according to @freeze, and perform * freezing or thawing as necessary. */ static void freezer_apply_state(struct freezer *freezer, bool freeze, unsigned int state) { /* also synchronizes against task migration, see freezer_attach() */ lockdep_assert_held(&freezer_mutex); if (!(freezer->state & CGROUP_FREEZER_ONLINE)) return; if (freeze) { if (!(freezer->state & CGROUP_FREEZING)) static_branch_inc_cpuslocked(&freezer_active); freezer->state |= state; freeze_cgroup(freezer); } else { bool was_freezing = freezer->state & CGROUP_FREEZING; freezer->state &= ~state; if (!(freezer->state & CGROUP_FREEZING)) { freezer->state &= ~CGROUP_FROZEN; if (was_freezing) static_branch_dec_cpuslocked(&freezer_active); unfreeze_cgroup(freezer); } } } /** * freezer_change_state - change the freezing state of a cgroup_freezer * @freezer: freezer of interest * @freeze: whether to freeze or thaw * * Freeze or thaw @freezer according to @freeze. The operations are * recursive - all descendants of @freezer will be affected. */ static void freezer_change_state(struct freezer *freezer, bool freeze) { struct cgroup_subsys_state *pos; cpus_read_lock(); /* * Update all its descendants in pre-order traversal. Each * descendant will try to inherit its parent's FREEZING state as * CGROUP_FREEZING_PARENT. */ mutex_lock(&freezer_mutex); rcu_read_lock(); css_for_each_descendant_pre(pos, &freezer->css) { struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); if (!css_tryget_online(pos)) continue; rcu_read_unlock(); if (pos_f == freezer) freezer_apply_state(pos_f, freeze, CGROUP_FREEZING_SELF); else freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, CGROUP_FREEZING_PARENT); rcu_read_lock(); css_put(pos); } rcu_read_unlock(); mutex_unlock(&freezer_mutex); cpus_read_unlock(); } static ssize_t freezer_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { bool freeze; buf = strstrip(buf); if (strcmp(buf, freezer_state_strs(0)) == 0) freeze = false; else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) { pr_info_once("Freezing with imperfect legacy cgroup freezer. 
" "See cgroup.freeze of cgroup v2\n"); freeze = true; } else return -EINVAL; freezer_change_state(css_freezer(of_css(of)), freeze); return nbytes; } static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct freezer *freezer = css_freezer(css); return (bool)(freezer->state & CGROUP_FREEZING_SELF); } static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct freezer *freezer = css_freezer(css); return (bool)(freezer->state & CGROUP_FREEZING_PARENT); } static struct cftype files[] = { { .name = "state", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = freezer_read, .write = freezer_write, }, { .name = "self_freezing", .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = freezer_self_freezing_read, }, { .name = "parent_freezing", .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = freezer_parent_freezing_read, }, { } /* terminate */ }; struct cgroup_subsys freezer_cgrp_subsys = { .css_alloc = freezer_css_alloc, .css_online = freezer_css_online, .css_offline = freezer_css_offline, .css_free = freezer_css_free, .attach = freezer_attach, .fork = freezer_fork, .legacy_cftypes = files, }; |
| 19788 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/fault-inject.h> #include <linux/fault-inject-usercopy.h> static struct { struct fault_attr attr; } fail_usercopy = { .attr = FAULT_ATTR_INITIALIZER, }; static int __init setup_fail_usercopy(char *str) { return setup_fault_attr(&fail_usercopy.attr, str); } __setup("fail_usercopy=", setup_fail_usercopy); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_usercopy_debugfs(void) { struct dentry *dir; dir = fault_create_debugfs_attr("fail_usercopy", NULL, &fail_usercopy.attr); return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_usercopy_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ bool should_fail_usercopy(void) { return should_fail(&fail_usercopy.attr, 1); } EXPORT_SYMBOL_GPL(should_fail_usercopy); |
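The fault_create_debugfs_attr() call above exposes the usual fault-injection knobs for fail_usercopy under debugfs, in addition to the fail_usercopy= boot parameter parsed by setup_fault_attr(). A sketch of enabling it from userspace, assuming debugfs is mounted at /sys/kernel/debug and relying on the generic fault_attr attribute names (probability, interval, times) rather than anything specific to this file:

/*
 * Hedged sketch: turning on usercopy fault injection via debugfs.
 * The mount point and attribute names are assumptions based on the
 * generic fault-injection framework, not defined in this file.
 */
#include <stdio.h>

static int write_attr(const char *name, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/kernel/debug/fail_usercopy/%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	/* Fail roughly 10% of user copies, with no limit on the count. */
	if (write_attr("probability", "10") ||
	    write_attr("interval", "1") ||
	    write_attr("times", "-1"))
		perror("fail_usercopy debugfs");
	return 0;
}

The boot-time alternative uses the generic "<interval>,<probability>,<space>,<times>" string accepted by setup_fault_attr().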
| 297 389 244 | /*
SPDX-License-Identifier: GPL-2.0 */ #ifndef _SCSI_SCSI_HOST_H #define _SCSI_SCSI_HOST_H #include <linux/device.h> #include <linux/list.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/mutex.h> #include <linux/seq_file.h> #include <linux/blk-mq.h> #include <scsi/scsi.h> struct block_device; struct completion; struct module; struct scsi_cmnd; struct scsi_device; struct scsi_target; struct Scsi_Host; struct scsi_transport_template; #define SG_ALL SG_CHUNK_SIZE #define MODE_UNKNOWN 0x00 #define MODE_INITIATOR 0x01 #define MODE_TARGET 0x02 /** * enum scsi_timeout_action - How to handle a command that timed out. * @SCSI_EH_DONE: The command has already been completed. * @SCSI_EH_RESET_TIMER: Reset the timer and continue waiting for completion. * @SCSI_EH_NOT_HANDLED: The command has not yet finished. Abort the command. */ enum scsi_timeout_action { SCSI_EH_DONE, SCSI_EH_RESET_TIMER, SCSI_EH_NOT_HANDLED, }; struct scsi_host_template { /* * Put fields referenced in IO submission path together in * same cacheline */ /* * Additional per-command data allocated for the driver. */ unsigned int cmd_size; /* * The queuecommand function is used to queue up a scsi * command block to the LLDD. When the driver finished * processing the command the done callback is invoked. * * If queuecommand returns 0, then the driver has accepted the * command. It must also push it to the HBA if the scsi_cmnd * flag SCMD_LAST is set, or if the driver does not implement * commit_rqs. The done() function must be called on the command * when the driver has finished with it. (you may call done on the * command before queuecommand returns, but in this case you * *must* return 0 from queuecommand). * * Queuecommand may also reject the command, in which case it may * not touch the command and must not call done() for it. * * There are two possible rejection returns: * * SCSI_MLQUEUE_DEVICE_BUSY: Block this device temporarily, but * allow commands to other devices serviced by this host. * * SCSI_MLQUEUE_HOST_BUSY: Block all devices served by this * host temporarily. * * For compatibility, any other non-zero return is treated the * same as SCSI_MLQUEUE_HOST_BUSY. * * NOTE: "temporarily" means either until the next command for# * this device/host completes, or a period of time determined by * I/O pressure in the system if there are no other outstanding * commands. * * STATUS: REQUIRED */ int (* queuecommand)(struct Scsi_Host *, struct scsi_cmnd *); /* * The commit_rqs function is used to trigger a hardware * doorbell after some requests have been queued with * queuecommand, when an error is encountered before sending * the request with SCMD_LAST set. * * STATUS: OPTIONAL */ void (*commit_rqs)(struct Scsi_Host *, u16); struct module *module; const char *name; /* * The info function will return whatever useful information the * developer sees fit. If not provided, then the name field will * be used instead. * * Status: OPTIONAL */ const char *(*info)(struct Scsi_Host *); /* * Ioctl interface * * Status: OPTIONAL */ int (*ioctl)(struct scsi_device *dev, unsigned int cmd, void __user *arg); #ifdef CONFIG_COMPAT /* * Compat handler. Handle 32bit ABI. * When unknown ioctl is passed return -ENOIOCTLCMD. * * Status: OPTIONAL */ int (*compat_ioctl)(struct scsi_device *dev, unsigned int cmd, void __user *arg); #endif int (*init_cmd_priv)(struct Scsi_Host *shost, struct scsi_cmnd *cmd); int (*exit_cmd_priv)(struct Scsi_Host *shost, struct scsi_cmnd *cmd); /* * This is an error handling strategy routine. 
You don't need to * define one of these if you don't want to - there is a default * routine that is present that should work in most cases. For those * driver authors that have the inclination and ability to write their * own strategy routine, this is where it is specified. Note - the * strategy routine is *ALWAYS* run in the context of the kernel eh * thread. Thus you are guaranteed to *NOT* be in an interrupt * handler when you execute this, and you are also guaranteed to * *NOT* have any other commands being queued while you are in the * strategy routine. When you return from this function, operations * return to normal. * * See scsi_error.c scsi_unjam_host for additional comments about * what this function should and should not be attempting to do. * * Status: REQUIRED (at least one of them) */ int (* eh_abort_handler)(struct scsi_cmnd *); int (* eh_device_reset_handler)(struct scsi_cmnd *); int (* eh_target_reset_handler)(struct scsi_cmnd *); int (* eh_bus_reset_handler)(struct scsi_cmnd *); int (* eh_host_reset_handler)(struct scsi_cmnd *); /* * Before the mid layer attempts to scan for a new device where none * currently exists, it will call this entry in your driver. Should * your driver need to allocate any structs or perform any other init * items in order to send commands to a currently unused target/lun * combo, then this is where you can perform those allocations. This * is specifically so that drivers won't have to perform any kind of * "is this a new device" checks in their queuecommand routine, * thereby making the hot path a bit quicker. * * Return values: 0 on success, non-0 on failure * * Deallocation: If we didn't find any devices at this ID, you will * get an immediate call to sdev_destroy(). If we find something * here then you will get a call to sdev_configure(), then the * device will be used for however long it is kept around, then when * the device is removed from the system (or * possibly at reboot * time), you will then get a call to sdev_destroy(). This is * assuming you implement sdev_configure and sdev_destroy. * However, if you allocate memory and hang it off the device struct, * then you must implement the sdev_destroy() routine at a minimum * in order to avoid leaking memory * each time a device is tore down. * * Status: OPTIONAL */ int (* sdev_init)(struct scsi_device *); /* * Once the device has responded to an INQUIRY and we know the * device is online, we call into the low level driver with the * struct scsi_device *. If the low level device driver implements * this function, it *must* perform the task of setting the queue * depth on the device. All other tasks are optional and depend * on what the driver supports and various implementation details. * * Things currently recommended to be handled at this time include: * * 1. Setting the device queue depth. Proper setting of this is * described in the comments for scsi_change_queue_depth. * 2. Determining if the device supports the various synchronous * negotiation protocols. The device struct will already have * responded to INQUIRY and the results of the standard items * will have been shoved into the various device flag bits, eg. * device->sdtr will be true if the device supports SDTR messages. * 3. Allocating command structs that the device will need. * 4. Setting the default timeout on this device (if needed). * 5. Anything else the low level driver might want to do on a device * specific setup basis... * 6. Return 0 on success, non-0 on error. 
The device will be marked * as offline on error so that no access will occur. If you return * non-0, your sdev_destroy routine will never get called for this * device, so don't leave any loose memory hanging around, clean * up after yourself before returning non-0 * * Status: OPTIONAL */ int (* sdev_configure)(struct scsi_device *, struct queue_limits *lim); /* * Immediately prior to deallocating the device and after all activity * has ceased the mid layer calls this point so that the low level * driver may completely detach itself from the scsi device and vice * versa. The low level driver is responsible for freeing any memory * it allocated in the sdev_init or sdev_configure calls. * * Status: OPTIONAL */ void (* sdev_destroy)(struct scsi_device *); /* * Before the mid layer attempts to scan for a new device attached * to a target where no target currently exists, it will call this * entry in your driver. Should your driver need to allocate any * structs or perform any other init items in order to send commands * to a currently unused target, then this is where you can perform * those allocations. * * Return values: 0 on success, non-0 on failure * * Status: OPTIONAL */ int (* target_alloc)(struct scsi_target *); /* * Immediately prior to deallocating the target structure, and * after all activity to attached scsi devices has ceased, the * midlayer calls this point so that the driver may deallocate * and terminate any references to the target. * * Note: This callback is called with the host lock held and hence * must not sleep. * * Status: OPTIONAL */ void (* target_destroy)(struct scsi_target *); /* * If a host has the ability to discover targets on its own instead * of scanning the entire bus, it can fill in this function and * call scsi_scan_host(). This function will be called periodically * until it returns 1 with the scsi_host and the elapsed time of * the scan in jiffies. * * Status: OPTIONAL */ int (* scan_finished)(struct Scsi_Host *, unsigned long); /* * If the host wants to be called before the scan starts, but * after the midlayer has set up ready for the scan, it can fill * in this function. * * Status: OPTIONAL */ void (* scan_start)(struct Scsi_Host *); /* * Fill in this function to allow the queue depth of this host * to be changeable (on a per device basis). Returns either * the current queue depth setting (may be different from what * was passed in) or an error. An error should only be * returned if the requested depth is legal but the driver was * unable to set it. If the requested depth is illegal, the * driver should set and return the closest legal queue depth. * * Status: OPTIONAL */ int (* change_queue_depth)(struct scsi_device *, int); /* * This functions lets the driver expose the queue mapping * to the block layer. * * Status: OPTIONAL */ void (* map_queues)(struct Scsi_Host *shost); /* * SCSI interface of blk_poll - poll for IO completions. * Only applicable if SCSI LLD exposes multiple h/w queues. * * Return value: Number of completed entries found. * * Status: OPTIONAL */ int (* mq_poll)(struct Scsi_Host *shost, unsigned int queue_num); /* * Check if scatterlists need to be padded for DMA draining. * * Status: OPTIONAL */ bool (* dma_need_drain)(struct request *rq); /* * This function determines the BIOS parameters for a given * harddisk. These tend to be numbers that are made up by * the host adapter. 
Parameters: * size, device, list (heads, sectors, cylinders) * * Status: OPTIONAL */ int (* bios_param)(struct scsi_device *, struct gendisk *, sector_t, int []); /* * This function is called when one or more partitions on the * device reach beyond the end of the device. * * Status: OPTIONAL */ void (*unlock_native_capacity)(struct scsi_device *); /* * Can be used to export driver statistics and other infos to the * world outside the kernel ie. userspace and it also provides an * interface to feed the driver with information. * * Status: OBSOLETE */ int (*show_info)(struct seq_file *, struct Scsi_Host *); int (*write_info)(struct Scsi_Host *, char *, int); /* * This is an optional routine that allows the transport to become * involved when a scsi io timer fires. The return value tells the * timer routine how to finish the io timeout handling. * * Status: OPTIONAL */ enum scsi_timeout_action (*eh_timed_out)(struct scsi_cmnd *); /* * Optional routine that allows the transport to decide if a cmd * is retryable. Return true if the transport is in a state the * cmd should be retried on. */ bool (*eh_should_retry_cmd)(struct scsi_cmnd *scmd); /* This is an optional routine that allows transport to initiate * LLD adapter or firmware reset using sysfs attribute. * * Return values: 0 on success, -ve value on failure. * * Status: OPTIONAL */ int (*host_reset)(struct Scsi_Host *shost, int reset_type); #define SCSI_ADAPTER_RESET 1 #define SCSI_FIRMWARE_RESET 2 /* * Name of proc directory */ const char *proc_name; /* * This determines if we will use a non-interrupt driven * or an interrupt driven scheme. It is set to the maximum number * of simultaneous commands a single hw queue in HBA will accept. */ int can_queue; /* * In many instances, especially where disconnect / reconnect are * supported, our host also has an ID on the SCSI bus. If this is * the case, then it must be reserved. Please set this_id to -1 if * your setup is in single initiator mode, and the host lacks an * ID. */ int this_id; /* * This determines the degree to which the host adapter is capable * of scatter-gather. */ unsigned short sg_tablesize; unsigned short sg_prot_tablesize; /* * Set this if the host adapter has limitations beside segment count. */ unsigned int max_sectors; /* * Maximum size in bytes of a single segment. */ unsigned int max_segment_size; unsigned int dma_alignment; /* * DMA scatter gather segment boundary limit. A segment crossing this * boundary will be split in two. */ unsigned long dma_boundary; unsigned long virt_boundary_mask; /* * This specifies "machine infinity" for host templates which don't * limit the transfer size. Note this limit represents an absolute * maximum, and may be over the transfer limits allowed for * individual devices (e.g. 256 for SCSI-1). */ #define SCSI_DEFAULT_MAX_SECTORS 1024 /* * True if this host adapter can make good use of linked commands. * This will allow more than one command to be queued to a given * unit on a given host. Set this to the maximum number of command * blocks to be provided for each device. Set this to 1 for one * command block per lun, 2 for two, etc. Do not set this to 0. * You should make sure that the host adapter will do the right thing * before you try setting this above 1. */ short cmd_per_lun; /* * Allocate tags starting from last allocated tag. */ bool tag_alloc_policy_rr : 1; /* * Track QUEUE_FULL events and reduce queue depth on demand. */ unsigned track_queue_depth:1; /* * This specifies the mode that a LLD supports. 
*/ unsigned supported_mode:2; /* * True for emulated SCSI host adapters (e.g. ATAPI). */ unsigned emulated:1; /* * True if the low-level driver performs its own reset-settle delays. */ unsigned skip_settle_delay:1; /* True if the controller does not support WRITE SAME */ unsigned no_write_same:1; /* True if the host uses host-wide tagspace */ unsigned host_tagset:1; /* The queuecommand callback may block. See also BLK_MQ_F_BLOCKING. */ unsigned queuecommand_may_block:1; /* * Countdown for host blocking with no commands outstanding. */ unsigned int max_host_blocked; /* * Default value for the blocking. If the queue is empty, * host_blocked counts down in the request_fn until it restarts * host operations as zero is reached. * * FIXME: This should probably be a value in the template */ #define SCSI_DEFAULT_HOST_BLOCKED 7 /* * Pointer to the SCSI host sysfs attribute groups, NULL terminated. */ const struct attribute_group **shost_groups; /* * Pointer to the SCSI device attribute groups for this host, * NULL terminated. */ const struct attribute_group **sdev_groups; /* * Vendor Identifier associated with the host * * Note: When specifying vendor_id, be sure to read the * Vendor Type and ID formatting requirements specified in * scsi_netlink.h */ u64 vendor_id; }; /* * Temporary #define for host lock push down. Can be removed when all * drivers have been updated to take advantage of unlocked * queuecommand. * */ #define DEF_SCSI_QCMD(func_name) \ int func_name(struct Scsi_Host *shost, struct scsi_cmnd *cmd) \ { \ unsigned long irq_flags; \ int rc; \ spin_lock_irqsave(shost->host_lock, irq_flags); \ rc = func_name##_lck(cmd); \ spin_unlock_irqrestore(shost->host_lock, irq_flags); \ return rc; \ } /* * shost state: If you alter this, you also need to alter scsi_sysfs.c * (for the ascii descriptions) and the state model enforcer: * scsi_host_set_state() */ enum scsi_host_state { SHOST_CREATED = 1, SHOST_RUNNING, SHOST_CANCEL, SHOST_DEL, SHOST_RECOVERY, SHOST_CANCEL_RECOVERY, SHOST_DEL_RECOVERY, }; struct Scsi_Host { /* * __devices is protected by the host_lock, but you should * usually use scsi_device_lookup / shost_for_each_device * to access it and don't care about locking yourself. * In the rare case of being in irq context you can use * their __ prefixed variants with the lock held. NEVER * access this list directly from a driver. */ struct list_head __devices; struct list_head __targets; struct list_head starved_list; spinlock_t default_lock; spinlock_t *host_lock; struct mutex scan_mutex;/* serialize scanning activity */ struct list_head eh_abort_list; struct list_head eh_cmd_q; struct task_struct * ehandler; /* Error recovery thread. */ struct completion * eh_action; /* Wait for specific actions on the host. */ wait_queue_head_t host_wait; const struct scsi_host_template *hostt; struct scsi_transport_template *transportt; struct kref tagset_refcnt; struct completion tagset_freed; /* Area to keep a shared tag map */ struct blk_mq_tag_set tag_set; atomic_t host_blocked; unsigned int host_failed; /* commands that failed. protected by host_lock */ unsigned int host_eh_scheduled; /* EH scheduled without command */ unsigned int host_no; /* Used for IOCTL_GET_IDLUN, /proc/scsi et al. */ /* next two fields are used to bound the time spent in error handling */ int eh_deadline; unsigned long last_reset; /* * These three parameters can be used to allow for wide scsi, * and for host adapters that support multiple busses * The last two should be set to 1 more than the actual max id * or lun (e.g. 
8 for SCSI parallel systems). */ unsigned int max_channel; unsigned int max_id; u64 max_lun; /* * This is a unique identifier that must be assigned so that we * have some way of identifying each detected host adapter properly * and uniquely. For hosts that do not support more than one card * in the system at one time, this does not need to be set. It is * initialized to 0 in scsi_host_alloc. */ unsigned int unique_id; /* * The maximum length of SCSI commands that this host can accept. * Probably 12 for most host adapters, but could be 16 for others. * or 260 if the driver supports variable length cdbs. * For drivers that don't set this field, a value of 12 is * assumed. */ unsigned short max_cmd_len; int this_id; int can_queue; short cmd_per_lun; short unsigned int sg_tablesize; short unsigned int sg_prot_tablesize; unsigned int max_sectors; unsigned int opt_sectors; unsigned int max_segment_size; unsigned int dma_alignment; unsigned long dma_boundary; unsigned long virt_boundary_mask; /* * In scsi-mq mode, the number of hardware queues supported by the LLD. * * Note: it is assumed that each hardware queue has a queue depth of * can_queue. In other words, the total queue depth per host * is nr_hw_queues * can_queue. However, for when host_tagset is set, * the total queue depth is can_queue. */ unsigned nr_hw_queues; unsigned nr_maps; unsigned active_mode:2; /* * Host has requested that no further requests come through for the * time being. */ unsigned host_self_blocked:1; /* * Host uses correct SCSI ordering not PC ordering. The bit is * set for the minority of drivers whose authors actually read * the spec ;). */ unsigned reverse_ordering:1; /* Task mgmt function in progress */ unsigned tmf_in_progress:1; /* Asynchronous scan in progress */ unsigned async_scan:1; /* Don't resume host in EH */ unsigned eh_noresume:1; /* The controller does not support WRITE SAME */ unsigned no_write_same:1; /* True if the host uses host-wide tagspace */ unsigned host_tagset:1; /* The queuecommand callback may block. See also BLK_MQ_F_BLOCKING. */ unsigned queuecommand_may_block:1; /* Host responded with short (<36 bytes) INQUIRY result */ unsigned short_inquiry:1; /* The transport requires the LUN bits NOT to be stored in CDB[1] */ unsigned no_scsi2_lun_in_cdb:1; /* * Optional work queue to be utilized by the transport */ struct workqueue_struct *work_q; /* * Task management function work queue */ struct workqueue_struct *tmf_work_q; /* * Value host_blocked counts down from */ unsigned int max_host_blocked; /* Protection Information */ unsigned int prot_capabilities; unsigned char prot_guard_type; /* legacy crap */ unsigned long base; unsigned long io_port; unsigned char n_io_port; unsigned char dma_channel; unsigned int irq; enum scsi_host_state shost_state; /* ldm bits */ struct device shost_gendev, shost_dev; /* * Points to the transport data (if any) which is allocated * separately */ void *shost_data; /* * Points to the physical bus device we'd use to do DMA * Needed just in case we have virtual hosts. */ struct device *dma_dev; /* Delay for runtime autosuspend */ int rpm_autosuspend_delay; /* * We should ensure that this is aligned, both for better performance * and also because some compilers (m68k) don't automatically force * alignment to a long boundary. */ unsigned long hostdata[] /* Used for storage of host specific stuff */ __attribute__ ((aligned (sizeof(unsigned long)))); }; #define class_to_shost(d) \ container_of(d, struct Scsi_Host, shost_dev) #define shost_printk(prefix, shost, fmt, a...) 
\ dev_printk(prefix, &(shost)->shost_gendev, fmt, ##a) static inline void *shost_priv(struct Scsi_Host *shost) { return (void *)shost->hostdata; } int scsi_is_host_device(const struct device *); static inline struct Scsi_Host *dev_to_shost(struct device *dev) { while (!scsi_is_host_device(dev)) { if (!dev->parent) return NULL; dev = dev->parent; } return container_of(dev, struct Scsi_Host, shost_gendev); } static inline int scsi_host_in_recovery(struct Scsi_Host *shost) { return shost->shost_state == SHOST_RECOVERY || shost->shost_state == SHOST_CANCEL_RECOVERY || shost->shost_state == SHOST_DEL_RECOVERY || shost->tmf_in_progress; } extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *); extern void scsi_flush_work(struct Scsi_Host *); extern struct Scsi_Host *scsi_host_alloc(const struct scsi_host_template *, int); extern int __must_check scsi_add_host_with_dma(struct Scsi_Host *, struct device *, struct device *); #if defined(CONFIG_SCSI_PROC_FS) struct proc_dir_entry * scsi_template_proc_dir(const struct scsi_host_template *sht); #else #define scsi_template_proc_dir(sht) NULL #endif extern void scsi_scan_host(struct Scsi_Host *); extern int scsi_resume_device(struct scsi_device *sdev); extern int scsi_rescan_device(struct scsi_device *sdev); extern void scsi_remove_host(struct Scsi_Host *); extern struct Scsi_Host *scsi_host_get(struct Scsi_Host *); extern int scsi_host_busy(struct Scsi_Host *shost); extern void scsi_host_put(struct Scsi_Host *t); extern struct Scsi_Host *scsi_host_lookup(unsigned int hostnum); extern const char *scsi_host_state_name(enum scsi_host_state); extern void scsi_host_complete_all_commands(struct Scsi_Host *shost, enum scsi_host_status status); static inline int __must_check scsi_add_host(struct Scsi_Host *host, struct device *dev) { return scsi_add_host_with_dma(host, dev, dev); } static inline struct device *scsi_get_device(struct Scsi_Host *shost) { return shost->shost_gendev.parent; } /** * scsi_host_scan_allowed - Is scanning of this host allowed * @shost: Pointer to Scsi_Host. **/ static inline int scsi_host_scan_allowed(struct Scsi_Host *shost) { return shost->shost_state == SHOST_RUNNING || shost->shost_state == SHOST_RECOVERY; } extern void scsi_unblock_requests(struct Scsi_Host *); extern void scsi_block_requests(struct Scsi_Host *); extern int scsi_host_block(struct Scsi_Host *shost); extern int scsi_host_unblock(struct Scsi_Host *shost, int new_state); void scsi_host_busy_iter(struct Scsi_Host *, bool (*fn)(struct scsi_cmnd *, void *), void *priv); struct class_container; /* * DIF defines the exchange of protection information between * initiator and SBC block device. * * DIX defines the exchange of protection information between OS and * initiator. */ enum scsi_host_prot_capabilities { SHOST_DIF_TYPE1_PROTECTION = 1 << 0, /* T10 DIF Type 1 */ SHOST_DIF_TYPE2_PROTECTION = 1 << 1, /* T10 DIF Type 2 */ SHOST_DIF_TYPE3_PROTECTION = 1 << 2, /* T10 DIF Type 3 */ SHOST_DIX_TYPE0_PROTECTION = 1 << 3, /* DIX between OS and HBA only */ SHOST_DIX_TYPE1_PROTECTION = 1 << 4, /* DIX with DIF Type 1 */ SHOST_DIX_TYPE2_PROTECTION = 1 << 5, /* DIX with DIF Type 2 */ SHOST_DIX_TYPE3_PROTECTION = 1 << 6, /* DIX with DIF Type 3 */ }; /* * SCSI hosts which support the Data Integrity Extensions must * indicate their capabilities by setting the prot_capabilities using * this call. 
*/ static inline void scsi_host_set_prot(struct Scsi_Host *shost, unsigned int mask) { shost->prot_capabilities = mask; } static inline unsigned int scsi_host_get_prot(struct Scsi_Host *shost) { return shost->prot_capabilities; } static inline int scsi_host_prot_dma(struct Scsi_Host *shost) { return shost->prot_capabilities >= SHOST_DIX_TYPE0_PROTECTION; } static inline unsigned int scsi_host_dif_capable(struct Scsi_Host *shost, unsigned int target_type) { static unsigned char cap[] = { 0, SHOST_DIF_TYPE1_PROTECTION, SHOST_DIF_TYPE2_PROTECTION, SHOST_DIF_TYPE3_PROTECTION }; if (target_type >= ARRAY_SIZE(cap)) return 0; return shost->prot_capabilities & cap[target_type] ? target_type : 0; } static inline unsigned int scsi_host_dix_capable(struct Scsi_Host *shost, unsigned int target_type) { #if defined(CONFIG_BLK_DEV_INTEGRITY) static unsigned char cap[] = { SHOST_DIX_TYPE0_PROTECTION, SHOST_DIX_TYPE1_PROTECTION, SHOST_DIX_TYPE2_PROTECTION, SHOST_DIX_TYPE3_PROTECTION }; if (target_type >= ARRAY_SIZE(cap)) return 0; return shost->prot_capabilities & cap[target_type]; #endif return 0; } /* * All DIX-capable initiators must support the T10-mandated CRC * checksum. Controllers can optionally implement the IP checksum * scheme which has much lower impact on system performance. Note * that the main rationale for the checksum is to match integrity * metadata with data. Detecting bit errors are a job for ECC memory * and buses. */ enum scsi_host_guard_type { SHOST_DIX_GUARD_CRC = 1 << 0, SHOST_DIX_GUARD_IP = 1 << 1, }; static inline void scsi_host_set_guard(struct Scsi_Host *shost, unsigned char type) { shost->prot_guard_type = type; } static inline unsigned char scsi_host_get_guard(struct Scsi_Host *shost) { return shost->prot_guard_type; } extern int scsi_host_set_state(struct Scsi_Host *, enum scsi_host_state); #endif /* _SCSI_SCSI_HOST_H */ |
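To make the template/host split concrete, below is a hedged sketch of the minimal shape of a low-level driver built on this header: a scsi_host_template whose queuecommand immediately completes every command, registered with scsi_host_alloc(), scsi_add_host() and scsi_scan_host(). All example_* names are invented, and error handling is trimmed to the essentials.

/*
 * Hedged sketch, not a real driver: shows how the template and the
 * Scsi_Host allocation/registration calls declared above fit together.
 */
#include <linux/module.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_cmnd.h>

static int example_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *cmd)
{
	/* A real driver would post @cmd to hardware; here we just fail it. */
	set_host_byte(cmd, DID_NO_CONNECT);
	scsi_done(cmd);
	return 0;
}

static const struct scsi_host_template example_sht = {
	.module		= THIS_MODULE,
	.name		= "example",
	.proc_name	= "example",
	.queuecommand	= example_queuecommand,
	.can_queue	= 1,
	.this_id	= -1,
	.sg_tablesize	= SG_ALL,
	.cmd_per_lun	= 1,
};

static int example_probe(struct device *dev)
{
	struct Scsi_Host *shost;
	int err;

	shost = scsi_host_alloc(&example_sht, 0 /* no extra hostdata */);
	if (!shost)
		return -ENOMEM;

	err = scsi_add_host(shost, dev);
	if (err) {
		scsi_host_put(shost);
		return err;
	}
	scsi_scan_host(shost);
	return 0;
}

Since this queuecommand takes no host lock, DEF_SCSI_QCMD() is not needed; that macro only exists to wrap legacy ..._lck() implementations during the host-lock push-down mentioned above.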
2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 | // SPDX-License-Identifier: GPL-2.0-or-later /* * NET3 IP device support routines. * * Derived from the IP parts of dev.c 1.0.19 * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * * Additional Authors: * Alan Cox, <gw4pts@gw4pts.ampr.org> * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Changes: * Alexey Kuznetsov: pa_* fields are replaced with ifaddr * lists. * Cyrus Durgin: updated for kmod * Matthias Andree: in devinet_ioctl, compare label and * address (4.4BSD alias style support), * fall back to comparing just the label * if no match found. */ #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/if_addr.h> #include <linux/if_ether.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/notifier.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include "igmp_internal.h" #include <linux/slab.h> #include <linux/hash.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/kmod.h> #include <linux/netconf.h> #include <net/arp.h> #include <net/ip.h> #include <net/route.h> #include <net/ip_fib.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/addrconf.h> #define IPV6ONLY_FLAGS \ (IFA_F_NODAD | IFA_F_OPTIMISTIC | IFA_F_DADFAILED | \ IFA_F_HOMEADDRESS | IFA_F_TENTATIVE | \ IFA_F_MANAGETEMPADDR | IFA_F_STABLE_PRIVACY) static struct ipv4_devconf ipv4_devconf = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1, }, }; static struct ipv4_devconf ipv4_devconf_dflt = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1, }, }; #define IPV4_DEVCONF_DFLT(net, attr) \ IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr) static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U32 }, [IFA_ADDRESS] = { .type = NLA_U32 }, [IFA_BROADCAST] = { .type = NLA_U32 }, [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, [IFA_FLAGS] = { .type = NLA_U32 }, [IFA_RT_PRIORITY] = { .type = NLA_U32 }, [IFA_TARGET_NETNSID] = { .type = NLA_S32 }, [IFA_PROTO] = { .type = NLA_U8 }, }; #define IN4_ADDR_HSIZE_SHIFT 8 #define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) static u32 inet_addr_hash(const struct net *net, __be32 
addr) { u32 val = __ipv4_addr_hash(addr, net_hash_mix(net)); return hash_32(val, IN4_ADDR_HSIZE_SHIFT); } static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) { u32 hash = inet_addr_hash(net, ifa->ifa_local); ASSERT_RTNL(); hlist_add_head_rcu(&ifa->addr_lst, &net->ipv4.inet_addr_lst[hash]); } static void inet_hash_remove(struct in_ifaddr *ifa) { ASSERT_RTNL(); hlist_del_init_rcu(&ifa->addr_lst); } /** * __ip_dev_find - find the first device with a given source address. * @net: the net namespace * @addr: the source address * @devref: if true, take a reference on the found device * * If a caller uses devref=false, it should be protected by RCU, or RTNL */ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { struct net_device *result = NULL; struct in_ifaddr *ifa; rcu_read_lock(); ifa = inet_lookup_ifaddr_rcu(net, addr); if (!ifa) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res = { 0 }; struct fib_table *local; /* Fallback to FIB local table so that communication * over loopback subnets work. */ local = fib_get_table(net, RT_TABLE_LOCAL); if (local && !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && res.type == RTN_LOCAL) result = FIB_RES_DEV(res); } else { result = ifa->ifa_dev->dev; } if (result && devref) dev_hold(result); rcu_read_unlock(); return result; } EXPORT_SYMBOL(__ip_dev_find); /* called under RCU lock */ struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr) { u32 hash = inet_addr_hash(net, addr); struct in_ifaddr *ifa; hlist_for_each_entry_rcu(ifa, &net->ipv4.inet_addr_lst[hash], addr_lst) if (ifa->ifa_local == addr) return ifa; return NULL; } static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain); static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy); #ifdef CONFIG_SYSCTL static int devinet_sysctl_register(struct in_device *idev); static void devinet_sysctl_unregister(struct in_device *idev); #else static int devinet_sysctl_register(struct in_device *idev) { return 0; } static void devinet_sysctl_unregister(struct in_device *idev) { } #endif /* Locks all the inet devices. */ static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev) { struct in_ifaddr *ifa; ifa = kzalloc(sizeof(*ifa), GFP_KERNEL_ACCOUNT); if (!ifa) return NULL; in_dev_hold(in_dev); ifa->ifa_dev = in_dev; INIT_HLIST_NODE(&ifa->addr_lst); return ifa; } static void inet_rcu_free_ifa(struct rcu_head *head) { struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head); in_dev_put(ifa->ifa_dev); kfree(ifa); } static void inet_free_ifa(struct in_ifaddr *ifa) { /* Our reference to ifa->ifa_dev must be freed ASAP * to release the reference to the netdev the same way. * in_dev_put() -> in_dev_finish_destroy() -> netdev_put() */ call_rcu_hurry(&ifa->rcu_head, inet_rcu_free_ifa); } static void in_dev_free_rcu(struct rcu_head *head) { struct in_device *idev = container_of(head, struct in_device, rcu_head); kfree(rcu_dereference_protected(idev->mc_hash, 1)); kfree(idev); } void in_dev_finish_destroy(struct in_device *idev) { struct net_device *dev = idev->dev; WARN_ON(idev->ifa_list); WARN_ON(idev->mc_list); #ifdef NET_REFCNT_DEBUG pr_debug("%s: %p=%s\n", __func__, idev, dev ? 
dev->name : "NIL"); #endif netdev_put(dev, &idev->dev_tracker); if (!idev->dead) pr_err("Freeing alive in_device %p\n", idev); else call_rcu(&idev->rcu_head, in_dev_free_rcu); } EXPORT_SYMBOL(in_dev_finish_destroy); static struct in_device *inetdev_init(struct net_device *dev) { struct in_device *in_dev; int err = -ENOMEM; ASSERT_RTNL(); in_dev = kzalloc(sizeof(*in_dev), GFP_KERNEL); if (!in_dev) goto out; memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt, sizeof(in_dev->cnf)); in_dev->cnf.sysctl = NULL; in_dev->dev = dev; in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl); if (!in_dev->arp_parms) goto out_kfree; if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) netif_disable_lro(dev); /* Reference in_dev->dev */ netdev_hold(dev, &in_dev->dev_tracker, GFP_KERNEL); /* Account for reference dev->ip_ptr (below) */ refcount_set(&in_dev->refcnt, 1); if (dev != blackhole_netdev) { err = devinet_sysctl_register(in_dev); if (err) { in_dev->dead = 1; neigh_parms_release(&arp_tbl, in_dev->arp_parms); in_dev_put(in_dev); in_dev = NULL; goto out; } ip_mc_init_dev(in_dev); if (dev->flags & IFF_UP) ip_mc_up(in_dev); } /* we can receive as soon as ip_ptr is set -- do this last */ rcu_assign_pointer(dev->ip_ptr, in_dev); out: return in_dev ?: ERR_PTR(err); out_kfree: kfree(in_dev); in_dev = NULL; goto out; } static void inetdev_destroy(struct in_device *in_dev) { struct net_device *dev; struct in_ifaddr *ifa; ASSERT_RTNL(); dev = in_dev->dev; in_dev->dead = 1; ip_mc_destroy_dev(in_dev); while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) { inet_del_ifa(in_dev, &in_dev->ifa_list, 0); inet_free_ifa(ifa); } RCU_INIT_POINTER(dev->ip_ptr, NULL); devinet_sysctl_unregister(in_dev); neigh_parms_release(&arp_tbl, in_dev->arp_parms); arp_ifdown(dev); in_dev_put(in_dev); } static int __init inet_blackhole_dev_init(void) { struct in_device *in_dev; rtnl_lock(); in_dev = inetdev_init(blackhole_netdev); rtnl_unlock(); return PTR_ERR_OR_ZERO(in_dev); } late_initcall(inet_blackhole_dev_init); int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b) { const struct in_ifaddr *ifa; rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(a, ifa)) { if (!b || inet_ifa_match(b, ifa)) { rcu_read_unlock(); return 1; } } } rcu_read_unlock(); return 0; } static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy, struct nlmsghdr *nlh, u32 portid) { struct in_ifaddr *promote = NULL; struct in_ifaddr *ifa, *ifa1; struct in_ifaddr __rcu **last_prim; struct in_ifaddr *prev_prom = NULL; int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); ifa1 = rtnl_dereference(*ifap); last_prim = ifap; if (in_dev->dead) goto no_promotions; /* 1. 
Deleting primary ifaddr forces deletion all secondaries * unless alias promotion is set **/ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next; while ((ifa = rtnl_dereference(*ifap1)) != NULL) { if (!(ifa->ifa_flags & IFA_F_SECONDARY) && ifa1->ifa_scope <= ifa->ifa_scope) last_prim = &ifa->ifa_next; if (!(ifa->ifa_flags & IFA_F_SECONDARY) || ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) { ifap1 = &ifa->ifa_next; prev_prom = ifa; continue; } if (!do_promote) { inet_hash_remove(ifa); *ifap1 = ifa->ifa_next; rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); inet_free_ifa(ifa); } else { promote = ifa; break; } } } /* On promotion all secondaries from subnet are changing * the primary IP, we must remove all their routes silently * and later to add them back with new prefsrc. Do this * while all addresses are on the device list. */ for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) fib_del_ifaddr(ifa, ifa1); } no_promotions: /* 2. Unlink it */ *ifap = ifa1->ifa_next; inet_hash_remove(ifa1); /* 3. Announce address deletion */ /* Send message first, then call notifier. At first sight, FIB update triggered by notifier will refer to already deleted ifaddr, that could confuse netlink listeners. It is not true: look, gated sees that route deleted and if it still thinks that ifaddr is valid, it will try to restore deleted routes... Grr. So that, this order is correct. */ rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); if (promote) { struct in_ifaddr *next_sec; next_sec = rtnl_dereference(promote->ifa_next); if (prev_prom) { struct in_ifaddr *last_sec; rcu_assign_pointer(prev_prom->ifa_next, next_sec); last_sec = rtnl_dereference(*last_prim); rcu_assign_pointer(promote->ifa_next, last_sec); rcu_assign_pointer(*last_prim, promote); } promote->ifa_flags &= ~IFA_F_SECONDARY; rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); for (ifa = next_sec; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) continue; fib_add_ifaddr(ifa); } } if (destroy) inet_free_ifa(ifa1); } static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy) { __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); } static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid, struct netlink_ext_ack *extack) { struct in_ifaddr __rcu **last_primary, **ifap; struct in_device *in_dev = ifa->ifa_dev; struct net *net = dev_net(in_dev->dev); struct in_validator_info ivi; struct in_ifaddr *ifa1; int ret; ASSERT_RTNL(); ifa->ifa_flags &= ~IFA_F_SECONDARY; last_primary = &in_dev->ifa_list; /* Don't set IPv6 only flags to IPv4 addresses */ ifa->ifa_flags &= ~IPV6ONLY_FLAGS; ifap = &in_dev->ifa_list; ifa1 = rtnl_dereference(*ifap); while (ifa1) { if (!(ifa1->ifa_flags & IFA_F_SECONDARY) && ifa->ifa_scope <= ifa1->ifa_scope) last_primary = &ifa1->ifa_next; if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) { if (ifa1->ifa_local == ifa->ifa_local) { inet_free_ifa(ifa); return -EEXIST; } if (ifa1->ifa_scope != ifa->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid scope value"); inet_free_ifa(ifa); return -EINVAL; } ifa->ifa_flags |= IFA_F_SECONDARY; } ifap = 
		       &ifa1->ifa_next;
		ifa1 = rtnl_dereference(*ifap);
	}

	/* Allow any devices that wish to register ifaddr validators to weigh
	 * in now, before changes are committed.  The rtnl lock is serializing
	 * access here, so the state should not change between a validator call
	 * and a final notify on commit.  This isn't invoked on promotion under
	 * the assumption that validators are checking the address itself, and
	 * not the flags.
	 */
	ivi.ivi_addr = ifa->ifa_address;
	ivi.ivi_dev = ifa->ifa_dev;
	ivi.extack = extack;
	ret = blocking_notifier_call_chain(&inetaddr_validator_chain,
					   NETDEV_UP, &ivi);
	ret = notifier_to_errno(ret);
	if (ret) {
		inet_free_ifa(ifa);
		return ret;
	}

	if (!(ifa->ifa_flags & IFA_F_SECONDARY))
		ifap = last_primary;

	rcu_assign_pointer(ifa->ifa_next, *ifap);
	rcu_assign_pointer(*ifap, ifa);

	inet_hash_insert(dev_net(in_dev->dev), ifa);

	cancel_delayed_work(&net->ipv4.addr_chk_work);
	queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0);

	/* Send message first, then call notifier.
	   Notifier will trigger FIB update, so that
	   listeners of netlink will know about new ifaddr */
	rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid);
	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);

	return 0;
}

static int inet_insert_ifa(struct in_ifaddr *ifa)
{
	if (!ifa->ifa_local) {
		inet_free_ifa(ifa);
		return 0;
	}

	return __inet_insert_ifa(ifa, NULL, 0, NULL);
}

static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
{
	struct in_device *in_dev = __in_dev_get_rtnl_net(dev);

	ipv4_devconf_setall(in_dev);
	neigh_parms_data_state_setall(in_dev->arp_parms);

	if (ipv4_is_loopback(ifa->ifa_local))
		ifa->ifa_scope = RT_SCOPE_HOST;

	return inet_insert_ifa(ifa);
}

/* Caller must hold RCU or RTNL:
 * We don't take a reference on found in_device
 */
struct in_device *inetdev_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct in_device *in_dev = NULL;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		in_dev = rcu_dereference_rtnl(dev->ip_ptr);

	rcu_read_unlock();
	return in_dev;
}
EXPORT_SYMBOL(inetdev_by_index);
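
/*
 * Typical RTNL-side use of the lookup above, modelled on inet_rtm_deladdr()
 * below (error handling simplified here): resolve the ifindex carried in the
 * netlink request and bail out if the device has no IPv4 configuration.  No
 * reference is taken, so the result is only valid while RCU or RTNL is held.
 *
 *	in_dev = inetdev_by_index(net, ifm->ifa_index);
 *	if (!in_dev)
 *		return -ENODEV;
 */

/* Called only from RTNL semaphored context. No locks.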
*/ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask) { struct in_ifaddr *ifa; ASSERT_RTNL(); in_dev_for_each_ifa_rtnl(ifa, in_dev) { if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa)) return ifa; } return NULL; } static int ip_mc_autojoin_config(struct net *net, bool join, const struct in_ifaddr *ifa) { #if defined(CONFIG_IP_MULTICAST) struct ip_mreqn mreq = { .imr_multiaddr.s_addr = ifa->ifa_address, .imr_ifindex = ifa->ifa_dev->dev->ifindex, }; struct sock *sk = net->ipv4.mc_autojoin_sk; int ret; ASSERT_RTNL_NET(net); lock_sock(sk); if (join) ret = ip_mc_join_group(sk, &mreq); else ret = ip_mc_leave_group(sk, &mreq); release_sock(sk); return ret; #else return -EOPNOTSUPP; #endif } static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct in_ifaddr __rcu **ifap; struct nlattr *tb[IFA_MAX+1]; struct in_device *in_dev; struct ifaddrmsg *ifm; struct in_ifaddr *ifa; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) goto out; ifm = nlmsg_data(nlh); rtnl_net_lock(net); in_dev = inetdev_by_index(net, ifm->ifa_index); if (!in_dev) { NL_SET_ERR_MSG(extack, "ipv4: Device not found"); err = -ENODEV; goto unlock; } for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL])) continue; if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label)) continue; if (tb[IFA_ADDRESS] && (ifm->ifa_prefixlen != ifa->ifa_prefixlen || !inet_ifa_match(nla_get_in_addr(tb[IFA_ADDRESS]), ifa))) continue; if (ipv4_is_multicast(ifa->ifa_address)) ip_mc_autojoin_config(net, false, ifa); __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); goto unlock; } NL_SET_ERR_MSG(extack, "ipv4: Address not found"); err = -EADDRNOTAVAIL; unlock: rtnl_net_unlock(net); out: return err; } static void check_lifetime(struct work_struct *work) { unsigned long now, next, next_sec, next_sched; struct in_ifaddr *ifa; struct hlist_node *n; struct net *net; int i; net = container_of(to_delayed_work(work), struct net, ipv4.addr_chk_work); now = jiffies; next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); for (i = 0; i < IN4_ADDR_HSIZE; i++) { struct hlist_head *head = &net->ipv4.inet_addr_lst[i]; bool change_needed = false; rcu_read_lock(); hlist_for_each_entry_rcu(ifa, head, addr_lst) { unsigned long age, tstamp; u32 preferred_lft; u32 valid_lft; u32 flags; flags = READ_ONCE(ifa->ifa_flags); if (flags & IFA_F_PERMANENT) continue; preferred_lft = READ_ONCE(ifa->ifa_preferred_lft); valid_lft = READ_ONCE(ifa->ifa_valid_lft); tstamp = READ_ONCE(ifa->ifa_tstamp); /* We try to batch several events at once. */ age = (now - tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (valid_lft != INFINITY_LIFE_TIME && age >= valid_lft) { change_needed = true; } else if (preferred_lft == INFINITY_LIFE_TIME) { continue; } else if (age >= preferred_lft) { if (time_before(tstamp + valid_lft * HZ, next)) next = tstamp + valid_lft * HZ; if (!(flags & IFA_F_DEPRECATED)) change_needed = true; } else if (time_before(tstamp + preferred_lft * HZ, next)) { next = tstamp + preferred_lft * HZ; } } rcu_read_unlock(); if (!change_needed) continue; rtnl_net_lock(net); hlist_for_each_entry_safe(ifa, n, head, addr_lst) { unsigned long age; if (ifa->ifa_flags & IFA_F_PERMANENT) continue; /* We try to batch several events at once. 
*/ age = (now - ifa->ifa_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_valid_lft) { struct in_ifaddr __rcu **ifap; struct in_ifaddr *tmp; ifap = &ifa->ifa_dev->ifa_list; tmp = rtnl_net_dereference(net, *ifap); while (tmp) { if (tmp == ifa) { inet_del_ifa(ifa->ifa_dev, ifap, 1); break; } ifap = &tmp->ifa_next; tmp = rtnl_net_dereference(net, *ifap); } } else if (ifa->ifa_preferred_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_preferred_lft && !(ifa->ifa_flags & IFA_F_DEPRECATED)) { ifa->ifa_flags |= IFA_F_DEPRECATED; rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } rtnl_net_unlock(net); } next_sec = round_jiffies_up(next); next_sched = next; /* If rounded timeout is accurate enough, accept it. */ if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ)) next_sched = next_sec; now = jiffies; /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */ if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX)) next_sched = now + ADDRCONF_TIMER_FUZZ_MAX; queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, next_sched - now); } static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, __u32 prefered_lft) { unsigned long timeout; u32 flags; flags = ifa->ifa_flags & ~(IFA_F_PERMANENT | IFA_F_DEPRECATED); timeout = addrconf_timeout_fixup(valid_lft, HZ); if (addrconf_finite_timeout(timeout)) WRITE_ONCE(ifa->ifa_valid_lft, timeout); else flags |= IFA_F_PERMANENT; timeout = addrconf_timeout_fixup(prefered_lft, HZ); if (addrconf_finite_timeout(timeout)) { if (timeout == 0) flags |= IFA_F_DEPRECATED; WRITE_ONCE(ifa->ifa_preferred_lft, timeout); } WRITE_ONCE(ifa->ifa_flags, flags); WRITE_ONCE(ifa->ifa_tstamp, jiffies); if (!ifa->ifa_cstamp) WRITE_ONCE(ifa->ifa_cstamp, ifa->ifa_tstamp); } static int inet_validate_rtm(struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack, __u32 *valid_lft, __u32 *prefered_lft) { struct ifaddrmsg *ifm = nlmsg_data(nlh); int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) return err; if (ifm->ifa_prefixlen > 32) { NL_SET_ERR_MSG(extack, "ipv4: Invalid prefix length"); return -EINVAL; } if (!tb[IFA_LOCAL]) { NL_SET_ERR_MSG(extack, "ipv4: Local address is not supplied"); return -EINVAL; } if (tb[IFA_CACHEINFO]) { struct ifa_cacheinfo *ci; ci = nla_data(tb[IFA_CACHEINFO]); if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) { NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid"); return -EINVAL; } *valid_lft = ci->ifa_valid; *prefered_lft = ci->ifa_prefered; } return 0; } static struct in_ifaddr *inet_rtm_to_ifa(struct net *net, struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ifaddrmsg *ifm = nlmsg_data(nlh); struct in_device *in_dev; struct net_device *dev; struct in_ifaddr *ifa; int err; dev = __dev_get_by_index(net, ifm->ifa_index); err = -ENODEV; if (!dev) { NL_SET_ERR_MSG(extack, "ipv4: Device not found"); goto errout; } in_dev = __in_dev_get_rtnl_net(dev); err = -ENOBUFS; if (!in_dev) goto errout; ifa = inet_alloc_ifa(in_dev); if (!ifa) /* * A potential indev allocation can be left alive, it stays * assigned to its device and is destroy with it. 
*/ goto errout; ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); if (!tb[IFA_ADDRESS]) tb[IFA_ADDRESS] = tb[IFA_LOCAL]; ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); ifa->ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); ifa->ifa_scope = ifm->ifa_scope; ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]); ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]); if (tb[IFA_BROADCAST]) ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]); if (tb[IFA_LABEL]) nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); if (tb[IFA_RT_PRIORITY]) ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]); if (tb[IFA_PROTO]) ifa->ifa_proto = nla_get_u8(tb[IFA_PROTO]); return ifa; errout: return ERR_PTR(err); } static struct in_ifaddr *find_matching_ifa(struct net *net, struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1; in_dev_for_each_ifa_rtnl_net(net, ifa1, in_dev) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa) && ifa1->ifa_local == ifa->ifa_local) return ifa1; } return NULL; } static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { __u32 prefered_lft = INFINITY_LIFE_TIME; __u32 valid_lft = INFINITY_LIFE_TIME; struct net *net = sock_net(skb->sk); struct in_ifaddr *ifa_existing; struct nlattr *tb[IFA_MAX + 1]; struct in_ifaddr *ifa; int ret; ret = inet_validate_rtm(nlh, tb, extack, &valid_lft, &prefered_lft); if (ret < 0) return ret; if (!nla_get_in_addr(tb[IFA_LOCAL])) return 0; rtnl_net_lock(net); ifa = inet_rtm_to_ifa(net, nlh, tb, extack); if (IS_ERR(ifa)) { ret = PTR_ERR(ifa); goto unlock; } ifa_existing = find_matching_ifa(net, ifa); if (!ifa_existing) { /* It would be best to check for !NLM_F_CREATE here but * userspace already relies on not having to provide this. */ set_ifa_lifetime(ifa, valid_lft, prefered_lft); if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) { ret = ip_mc_autojoin_config(net, true, ifa); if (ret < 0) { NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed"); inet_free_ifa(ifa); goto unlock; } } ret = __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, extack); } else { u32 new_metric = ifa->ifa_rt_priority; u8 new_proto = ifa->ifa_proto; inet_free_ifa(ifa); if (nlh->nlmsg_flags & NLM_F_EXCL || !(nlh->nlmsg_flags & NLM_F_REPLACE)) { NL_SET_ERR_MSG(extack, "ipv4: Address already assigned"); ret = -EEXIST; goto unlock; } ifa = ifa_existing; if (ifa->ifa_rt_priority != new_metric) { fib_modify_prefix_metric(ifa, new_metric); ifa->ifa_rt_priority = new_metric; } ifa->ifa_proto = new_proto; set_ifa_lifetime(ifa, valid_lft, prefered_lft); cancel_delayed_work(&net->ipv4.addr_chk_work); queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0); rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); } unlock: rtnl_net_unlock(net); return ret; } /* * Determine a default network mask, based on the IP address. */ static int inet_abc_len(__be32 addr) { int rc = -1; /* Something else, probably a multicast. 
*/ if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) rc = 0; else { __u32 haddr = ntohl(addr); if (IN_CLASSA(haddr)) rc = 8; else if (IN_CLASSB(haddr)) rc = 16; else if (IN_CLASSC(haddr)) rc = 24; else if (IN_CLASSE(haddr)) rc = 32; } return rc; } int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) { struct sockaddr_in sin_orig; struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr; struct in_ifaddr __rcu **ifap = NULL; struct in_device *in_dev; struct in_ifaddr *ifa = NULL; struct net_device *dev; char *colon; int ret = -EFAULT; int tryaddrmatch = 0; ifr->ifr_name[IFNAMSIZ - 1] = 0; /* save original address for comparison */ memcpy(&sin_orig, sin, sizeof(*sin)); colon = strchr(ifr->ifr_name, ':'); if (colon) *colon = 0; dev_load(net, ifr->ifr_name); switch (cmd) { case SIOCGIFADDR: /* Get interface address */ case SIOCGIFBRDADDR: /* Get the broadcast address */ case SIOCGIFDSTADDR: /* Get the destination address */ case SIOCGIFNETMASK: /* Get the netmask for the interface */ /* Note that these ioctls will not sleep, so that we do not impose a lock. One day we will be forced to put shlock here (I mean SMP) */ tryaddrmatch = (sin_orig.sin_family == AF_INET); memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; break; case SIOCSIFFLAGS: ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; break; case SIOCSIFADDR: /* Set interface address (and family) */ case SIOCSIFBRDADDR: /* Set the broadcast address */ case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; ret = -EINVAL; if (sin->sin_family != AF_INET) goto out; break; default: ret = -EINVAL; goto out; } rtnl_net_lock(net); ret = -ENODEV; dev = __dev_get_by_name(net, ifr->ifr_name); if (!dev) goto done; if (colon) *colon = ':'; in_dev = __in_dev_get_rtnl_net(dev); if (in_dev) { if (tryaddrmatch) { /* Matthias Andree */ /* compare label and address (4.4BSD style) */ /* note: we only do this for a limited set of ioctls and only if the original address family was AF_INET. This is checked above. 
*/ for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) { if (!strcmp(ifr->ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == ifa->ifa_local) { break; /* found */ } } } /* we didn't get a match, maybe the application is 4.3BSD-style and passed in junk so we fall back to comparing just the label */ if (!ifa) { for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) if (!strcmp(ifr->ifr_name, ifa->ifa_label)) break; } } ret = -EADDRNOTAVAIL; if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) goto done; switch (cmd) { case SIOCGIFADDR: /* Get interface address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_local; break; case SIOCGIFBRDADDR: /* Get the broadcast address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_broadcast; break; case SIOCGIFDSTADDR: /* Get the destination address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_address; break; case SIOCGIFNETMASK: /* Get the netmask for the interface */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_mask; break; case SIOCSIFFLAGS: if (colon) { ret = -EADDRNOTAVAIL; if (!ifa) break; ret = 0; if (!(ifr->ifr_flags & IFF_UP)) inet_del_ifa(in_dev, ifap, 1); break; } /* NETDEV_UP/DOWN/CHANGE could touch a peer dev */ ASSERT_RTNL(); ret = dev_change_flags(dev, ifr->ifr_flags, NULL); break; case SIOCSIFADDR: /* Set interface address (and family) */ ret = -EINVAL; if (inet_abc_len(sin->sin_addr.s_addr) < 0) break; if (!ifa) { ret = -ENOBUFS; if (!in_dev) break; ifa = inet_alloc_ifa(in_dev); if (!ifa) break; if (colon) memcpy(ifa->ifa_label, ifr->ifr_name, IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); } else { ret = 0; if (ifa->ifa_local == sin->sin_addr.s_addr) break; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = 0; ifa->ifa_scope = 0; } ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr; if (!(dev->flags & IFF_POINTOPOINT)) { ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address); ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); if ((dev->flags & IFF_BROADCAST) && ifa->ifa_prefixlen < 31) ifa->ifa_broadcast = ifa->ifa_address | ~ifa->ifa_mask; } else { ifa->ifa_prefixlen = 32; ifa->ifa_mask = inet_make_mask(32); } set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); ret = inet_set_ifa(dev, ifa); break; case SIOCSIFBRDADDR: /* Set the broadcast address */ ret = 0; if (ifa->ifa_broadcast != sin->sin_addr.s_addr) { inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = sin->sin_addr.s_addr; inet_insert_ifa(ifa); } break; case SIOCSIFDSTADDR: /* Set the destination address */ ret = 0; if (ifa->ifa_address == sin->sin_addr.s_addr) break; ret = -EINVAL; if (inet_abc_len(sin->sin_addr.s_addr) < 0) break; ret = 0; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_address = sin->sin_addr.s_addr; inet_insert_ifa(ifa); break; case SIOCSIFNETMASK: /* Set the netmask for the interface */ /* * The mask we set must be legal. */ ret = -EINVAL; if (bad_mask(sin->sin_addr.s_addr, 0)) break; ret = 0; if (ifa->ifa_mask != sin->sin_addr.s_addr) { __be32 old_mask = ifa->ifa_mask; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_mask = sin->sin_addr.s_addr; ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask); /* See if current broadcast address matches * with current netmask, then recalculate * the broadcast address. Otherwise it's a * funny address, so don't touch it since * the user seems to know what (s)he's doing... 
*/ if ((dev->flags & IFF_BROADCAST) && (ifa->ifa_prefixlen < 31) && (ifa->ifa_broadcast == (ifa->ifa_local|~old_mask))) { ifa->ifa_broadcast = (ifa->ifa_local | ~sin->sin_addr.s_addr); } inet_insert_ifa(ifa); } break; } done: rtnl_net_unlock(net); out: return ret; } int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { struct in_device *in_dev = __in_dev_get_rtnl_net(dev); const struct in_ifaddr *ifa; struct ifreq ifr; int done = 0; if (WARN_ON(size > sizeof(struct ifreq))) goto out; if (!in_dev) goto out; in_dev_for_each_ifa_rtnl_net(dev_net(dev), ifa, in_dev) { if (!buf) { done += size; continue; } if (len < size) break; memset(&ifr, 0, sizeof(struct ifreq)); strcpy(ifr.ifr_name, ifa->ifa_label); (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET; (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = ifa->ifa_local; if (copy_to_user(buf + done, &ifr, size)) { done = -EFAULT; break; } len -= size; done += size; } out: return done; } static __be32 in_dev_select_addr(const struct in_device *in_dev, int scope) { const struct in_ifaddr *ifa; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) continue; if (ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) return ifa->ifa_local; } return 0; } __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) { const struct in_ifaddr *ifa; __be32 addr = 0; unsigned char localnet_scope = RT_SCOPE_HOST; struct in_device *in_dev; struct net *net; int master_idx; rcu_read_lock(); net = dev_net_rcu(dev); in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto no_in_dev; if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) localnet_scope = RT_SCOPE_LINK; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) continue; if (min(ifa->ifa_scope, localnet_scope) > scope) continue; if (!dst || inet_ifa_match(dst, ifa)) { addr = ifa->ifa_local; break; } if (!addr) addr = ifa->ifa_local; } if (addr) goto out_unlock; no_in_dev: master_idx = l3mdev_master_ifindex_rcu(dev); /* For VRFs, the VRF device takes the place of the loopback device, * with addresses on it being preferred. Note in such cases the * loopback device will be among the devices that fail the master_idx * equality check in the loop below. */ if (master_idx && (dev = dev_get_by_index_rcu(net, master_idx)) && (in_dev = __in_dev_get_rcu(dev))) { addr = in_dev_select_addr(in_dev, scope); if (addr) goto out_unlock; } /* Not loopback addresses on loopback should be preferred in this case. It is important that lo is the first interface in dev_base list. 
	 */
	for_each_netdev_rcu(net, dev) {
		if (l3mdev_master_ifindex_rcu(dev) != master_idx)
			continue;

		in_dev = __in_dev_get_rcu(dev);
		if (!in_dev)
			continue;

		addr = in_dev_select_addr(in_dev, scope);
		if (addr)
			goto out_unlock;
	}
out_unlock:
	rcu_read_unlock();
	return addr;
}
EXPORT_SYMBOL(inet_select_addr);

static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
				 __be32 local, int scope)
{
	unsigned char localnet_scope = RT_SCOPE_HOST;
	const struct in_ifaddr *ifa;
	__be32 addr = 0;
	int same = 0;

	if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev)))
		localnet_scope = RT_SCOPE_LINK;

	in_dev_for_each_ifa_rcu(ifa, in_dev) {
		unsigned char min_scope = min(ifa->ifa_scope, localnet_scope);

		if (!addr &&
		    (local == ifa->ifa_local || !local) &&
		    min_scope <= scope) {
			addr = ifa->ifa_local;
			if (same)
				break;
		}
		if (!same) {
			same = (!local || inet_ifa_match(local, ifa)) &&
				(!dst || inet_ifa_match(dst, ifa));
			if (same && addr) {
				if (local || !dst)
					break;
				/* Is the selected addr into dst subnet? */
				if (inet_ifa_match(addr, ifa))
					break;
				/* No, then can we use new local src? */
				if (min_scope <= scope) {
					addr = ifa->ifa_local;
					break;
				}
				/* search for large dst subnet for addr */
				same = 0;
			}
		}
	}

	return same ? addr : 0;
}

/*
 * Confirm that local IP address exists using wildcards:
 * - net: netns to check, cannot be NULL
 * - in_dev: only on this interface, NULL=any interface
 * - dst: only in the same subnet as dst, 0=any dst
 * - local: address, 0=autoselect the local address
 * - scope: maximum allowed scope value for the local address
 */
__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev,
			 __be32 dst, __be32 local, int scope)
{
	__be32 addr = 0;
	struct net_device *dev;

	if (in_dev)
		return confirm_addr_indev(in_dev, dst, local, scope);

	rcu_read_lock();
	for_each_netdev_rcu(net, dev) {
		in_dev = __in_dev_get_rcu(dev);
		if (in_dev) {
			addr = confirm_addr_indev(in_dev, dst, local, scope);
			if (addr)
				break;
		}
	}
	rcu_read_unlock();

	return addr;
}
EXPORT_SYMBOL(inet_confirm_addr);

/*
 * Device notifier
 */

int register_inetaddr_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&inetaddr_chain, nb);
}
EXPORT_SYMBOL(register_inetaddr_notifier);

int unregister_inetaddr_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&inetaddr_chain, nb);
}
EXPORT_SYMBOL(unregister_inetaddr_notifier);

int register_inetaddr_validator_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&inetaddr_validator_chain, nb);
}
EXPORT_SYMBOL(register_inetaddr_validator_notifier);

int unregister_inetaddr_validator_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&inetaddr_validator_chain,
						  nb);
}
EXPORT_SYMBOL(unregister_inetaddr_validator_notifier);
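
/*
 * A minimal sketch of how another subsystem might consume these chains; the
 * callback and variable names here are illustrative, not taken from this
 * file.  As done by __inet_del_ifa() and __inet_insert_ifa() above, the
 * inetaddr chain is invoked with NETDEV_UP/NETDEV_DOWN and a struct
 * in_ifaddr pointer:
 *
 *	static int my_inetaddr_event(struct notifier_block *nb,
 *				     unsigned long event, void *ptr)
 *	{
 *		struct in_ifaddr *ifa = ptr;
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%pI4 added on %s\n", &ifa->ifa_local,
 *				ifa->ifa_dev->dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_inetaddr_event,
 *	};
 *	...
 *	register_inetaddr_notifier(&my_nb);
 */

/* Rename ifa_labels for a device name change. Make some effort to preserve
 * existing alias numbering and to create unique labels if possible.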
*/ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) { struct in_ifaddr *ifa; int named = 0; in_dev_for_each_ifa_rtnl(ifa, in_dev) { char old[IFNAMSIZ], *dot; memcpy(old, ifa->ifa_label, IFNAMSIZ); memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); if (named++ == 0) goto skip; dot = strchr(old, ':'); if (!dot) { sprintf(old, ":%d", named); dot = old; } if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) strcat(ifa->ifa_label, dot); else strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); skip: rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } static void inetdev_send_gratuitous_arp(struct net_device *dev, struct in_device *in_dev) { const struct in_ifaddr *ifa; in_dev_for_each_ifa_rtnl(ifa, in_dev) { arp_send(ARPOP_REQUEST, ETH_P_ARP, ifa->ifa_local, dev, ifa->ifa_local, NULL, dev->dev_addr, NULL); } } /* Called only under RTNL semaphore */ static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev = __in_dev_get_rtnl(dev); ASSERT_RTNL(); if (!in_dev) { if (event == NETDEV_REGISTER) { in_dev = inetdev_init(dev); if (IS_ERR(in_dev)) return notifier_from_errno(PTR_ERR(in_dev)); if (dev->flags & IFF_LOOPBACK) { IN_DEV_CONF_SET(in_dev, NOXFRM, 1); IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); } } else if (event == NETDEV_CHANGEMTU) { /* Re-enabling IP */ if (inetdev_valid_mtu(dev->mtu)) in_dev = inetdev_init(dev); } goto out; } switch (event) { case NETDEV_REGISTER: pr_debug("%s: bug\n", __func__); RCU_INIT_POINTER(dev->ip_ptr, NULL); break; case NETDEV_UP: if (!inetdev_valid_mtu(dev->mtu)) break; if (dev->flags & IFF_LOOPBACK) { struct in_ifaddr *ifa = inet_alloc_ifa(in_dev); if (ifa) { ifa->ifa_local = ifa->ifa_address = htonl(INADDR_LOOPBACK); ifa->ifa_prefixlen = 8; ifa->ifa_mask = inet_make_mask(8); ifa->ifa_scope = RT_SCOPE_HOST; memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); inet_insert_ifa(ifa); } } ip_mc_up(in_dev); fallthrough; case NETDEV_CHANGEADDR: if (!IN_DEV_ARP_NOTIFY(in_dev)) break; fallthrough; case NETDEV_NOTIFY_PEERS: /* Send gratuitous ARP to notify of link change */ inetdev_send_gratuitous_arp(dev, in_dev); break; case NETDEV_DOWN: ip_mc_down(in_dev); break; case NETDEV_PRE_TYPE_CHANGE: ip_mc_unmap(in_dev); break; case NETDEV_POST_TYPE_CHANGE: ip_mc_remap(in_dev); break; case NETDEV_CHANGEMTU: if (inetdev_valid_mtu(dev->mtu)) break; /* disable IP when MTU is not enough */ fallthrough; case NETDEV_UNREGISTER: inetdev_destroy(in_dev); break; case NETDEV_CHANGENAME: /* Do not notify about label change, this event is * not interesting to applications using netlink. 
*/ inetdev_changename(dev, in_dev); devinet_sysctl_unregister(in_dev); devinet_sysctl_register(in_dev); break; } out: return NOTIFY_DONE; } static struct notifier_block ip_netdev_notifier = { .notifier_call = inetdev_event, }; static size_t inet_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(4) /* IFA_ADDRESS */ + nla_total_size(4) /* IFA_LOCAL */ + nla_total_size(4) /* IFA_BROADCAST */ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ + nla_total_size(4) /* IFA_FLAGS */ + nla_total_size(1) /* IFA_PROTO */ + nla_total_size(4) /* IFA_RT_PRIORITY */ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ } static inline u32 cstamp_delta(unsigned long cstamp) { return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; } static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, unsigned long tstamp, u32 preferred, u32 valid) { struct ifa_cacheinfo ci; ci.cstamp = cstamp_delta(cstamp); ci.tstamp = cstamp_delta(tstamp); ci.ifa_prefered = preferred; ci.ifa_valid = valid; return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci); } static int inet_fill_ifaddr(struct sk_buff *skb, const struct in_ifaddr *ifa, struct inet_fill_args *args) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; unsigned long tstamp; u32 preferred, valid; u32 flags; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm), args->flags); if (!nlh) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifa_family = AF_INET; ifm->ifa_prefixlen = ifa->ifa_prefixlen; flags = READ_ONCE(ifa->ifa_flags); /* Warning : ifm->ifa_flags is an __u8, it holds only 8 bits. * The 32bit value is given in IFA_FLAGS attribute. */ ifm->ifa_flags = (__u8)flags; ifm->ifa_scope = ifa->ifa_scope; ifm->ifa_index = ifa->ifa_dev->dev->ifindex; if (args->netnsid >= 0 && nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) goto nla_put_failure; tstamp = READ_ONCE(ifa->ifa_tstamp); if (!(flags & IFA_F_PERMANENT)) { preferred = READ_ONCE(ifa->ifa_preferred_lft); valid = READ_ONCE(ifa->ifa_valid_lft); if (preferred != INFINITY_LIFE_TIME) { long tval = (jiffies - tstamp) / HZ; if (preferred > tval) preferred -= tval; else preferred = 0; if (valid != INFINITY_LIFE_TIME) { if (valid > tval) valid -= tval; else valid = 0; } } } else { preferred = INFINITY_LIFE_TIME; valid = INFINITY_LIFE_TIME; } if ((ifa->ifa_address && nla_put_in_addr(skb, IFA_ADDRESS, ifa->ifa_address)) || (ifa->ifa_local && nla_put_in_addr(skb, IFA_LOCAL, ifa->ifa_local)) || (ifa->ifa_broadcast && nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || (ifa->ifa_label[0] && nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || (ifa->ifa_proto && nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) || nla_put_u32(skb, IFA_FLAGS, flags) || (ifa->ifa_rt_priority && nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) || put_cacheinfo(skb, READ_ONCE(ifa->ifa_cstamp), tstamp, preferred, valid)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct inet_fill_args *fillargs, struct net **tgt_net, struct sock *sk, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[IFA_MAX+1]; struct ifaddrmsg *ifm; int err, i; ifm = nlmsg_payload(nlh, sizeof(*ifm)); if (!ifm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request"); return -EINVAL; } if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request"); 
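		/* With strict checking, the request header may only carry
		 * ifa_index; ifa_prefixlen, ifa_flags and ifa_scope must be
		 * zero, and the only attribute honoured is
		 * IFA_TARGET_NETNSID (validated below).
		 */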
return -EINVAL; } fillargs->ifindex = ifm->ifa_index; if (fillargs->ifindex) { cb->answer_flags |= NLM_F_DUMP_FILTERED; fillargs->flags |= NLM_F_DUMP_FILTERED; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) return err; for (i = 0; i <= IFA_MAX; ++i) { if (!tb[i]) continue; if (i == IFA_TARGET_NETNSID) { struct net *net; fillargs->netnsid = nla_get_s32(tb[i]); net = rtnl_get_net_ns_capable(sk, fillargs->netnsid); if (IS_ERR(net)) { fillargs->netnsid = -1; NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id"); return PTR_ERR(net); } *tgt_net = net; } else { NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in dump request"); return -EINVAL; } } return 0; } static int in_dev_dump_ifmcaddr(struct in_device *in_dev, struct sk_buff *skb, struct netlink_callback *cb, int *s_ip_idx, struct inet_fill_args *fillargs) { struct ip_mc_list *im; int ip_idx = 0; int err; for (im = rcu_dereference(in_dev->mc_list); im; im = rcu_dereference(im->next_rcu)) { if (ip_idx < *s_ip_idx) { ip_idx++; continue; } err = inet_fill_ifmcaddr(skb, in_dev->dev, im, fillargs); if (err < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); ip_idx++; } err = 0; ip_idx = 0; done: *s_ip_idx = ip_idx; return err; } static int in_dev_dump_ifaddr(struct in_device *in_dev, struct sk_buff *skb, struct netlink_callback *cb, int *s_ip_idx, struct inet_fill_args *fillargs) { struct in_ifaddr *ifa; int ip_idx = 0; int err; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (ip_idx < *s_ip_idx) { ip_idx++; continue; } err = inet_fill_ifaddr(skb, ifa, fillargs); if (err < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); ip_idx++; } err = 0; ip_idx = 0; done: *s_ip_idx = ip_idx; return err; } static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, struct netlink_callback *cb, int *s_ip_idx, struct inet_fill_args *fillargs) { switch (fillargs->event) { case RTM_NEWADDR: return in_dev_dump_ifaddr(in_dev, skb, cb, s_ip_idx, fillargs); case RTM_GETMULTICAST: return in_dev_dump_ifmcaddr(in_dev, skb, cb, s_ip_idx, fillargs); default: return -EINVAL; } } /* Combine dev_addr_genid and dev_base_seq to detect changes. */ static u32 inet_base_seq(const struct net *net) { u32 res = atomic_read(&net->ipv4.dev_addr_genid) + READ_ONCE(net->dev_base_seq); /* Must not return 0 (see nl_dump_check_consistent()). * Chose a value far away from 0. 
*/ if (!res) res = 0x80000000; return res; } static int inet_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, int event) { const struct nlmsghdr *nlh = cb->nlh; struct inet_fill_args fillargs = { .portid = NETLINK_CB(cb->skb).portid, .seq = nlh->nlmsg_seq, .event = event, .flags = NLM_F_MULTI, .netnsid = -1, }; struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct { unsigned long ifindex; int ip_idx; } *ctx = (void *)cb->ctx; struct in_device *in_dev; struct net_device *dev; int err = 0; rcu_read_lock(); if (cb->strict_check) { err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, skb->sk, cb); if (err < 0) goto done; if (fillargs.ifindex) { dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex); if (!dev) { err = -ENODEV; goto done; } in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto done; err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx, &fillargs); goto done; } } cb->seq = inet_base_seq(tgt_net); for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx, &fillargs); if (err < 0) goto done; } done: if (fillargs.netnsid >= 0) put_net(tgt_net); rcu_read_unlock(); return err; } static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { return inet_dump_addr(skb, cb, RTM_NEWADDR); } static int inet_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb) { return inet_dump_addr(skb, cb, RTM_GETMULTICAST); } static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid) { struct inet_fill_args fillargs = { .portid = portid, .seq = nlh ? nlh->nlmsg_seq : 0, .event = event, .flags = 0, .netnsid = -1, }; struct sk_buff *skb; int err = -ENOBUFS; struct net *net; net = dev_net(ifa->ifa_dev->dev); skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL); if (!skb) goto errout; err = inet_fill_ifaddr(skb, ifa, &fillargs); if (err < 0) { /* -EMSGSIZE implies BUG in inet_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); } static size_t inet_get_link_af_size(const struct net_device *dev, u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); if (!in_dev) return 0; return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */ } static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev, u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); struct nlattr *nla; int i; if (!in_dev) return -ENODATA; nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4); if (!nla) return -EMSGSIZE; for (i = 0; i < IPV4_DEVCONF_MAX; i++) ((u32 *) nla_data(nla))[i] = READ_ONCE(in_dev->cnf.data[i]); return 0; } static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = { [IFLA_INET_CONF] = { .type = NLA_NESTED }, }; static int inet_validate_link_af(const struct net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct nlattr *a, *tb[IFLA_INET_MAX+1]; int err, rem; if (dev && !__in_dev_get_rtnl(dev)) return -EAFNOSUPPORT; err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, inet_af_policy, extack); if (err < 0) return err; if (tb[IFLA_INET_CONF]) { nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) { int cfgid = nla_type(a); if (nla_len(a) < 4) return -EINVAL; if (cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX) return -EINVAL; } } return 0; } static int inet_set_link_af(struct 
net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct in_device *in_dev = __in_dev_get_rtnl(dev); struct nlattr *a, *tb[IFLA_INET_MAX+1]; int rem; if (!in_dev) return -EAFNOSUPPORT; if (nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0) return -EINVAL; if (tb[IFLA_INET_CONF]) { nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a)); } return 0; } static int inet_netconf_msgsize_devconf(int type) { int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + nla_total_size(4); /* NETCONFA_IFINDEX */ bool all = false; if (type == NETCONFA_ALL) all = true; if (all || type == NETCONFA_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_RP_FILTER) size += nla_total_size(4); if (all || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_BC_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) size += nla_total_size(4); return size; } static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, const struct ipv4_devconf *devconf, u32 portid, u32 seq, int event, unsigned int flags, int type) { struct nlmsghdr *nlh; struct netconfmsg *ncm; bool all = false; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), flags); if (!nlh) return -EMSGSIZE; if (type == NETCONFA_ALL) all = true; ncm = nlmsg_data(nlh); ncm->ncm_family = AF_INET; if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) goto nla_put_failure; if (!devconf) goto out; if ((all || type == NETCONFA_FORWARDING) && nla_put_s32(skb, NETCONFA_FORWARDING, IPV4_DEVCONF_RO(*devconf, FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_RP_FILTER) && nla_put_s32(skb, NETCONFA_RP_FILTER, IPV4_DEVCONF_RO(*devconf, RP_FILTER)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_MC_FORWARDING) && nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF_RO(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_BC_FORWARDING) && nla_put_s32(skb, NETCONFA_BC_FORWARDING, IPV4_DEVCONF_RO(*devconf, BC_FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_PROXY_NEIGH) && nla_put_s32(skb, NETCONFA_PROXY_NEIGH, IPV4_DEVCONF_RO(*devconf, PROXY_ARP)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, IPV4_DEVCONF_RO(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0) goto nla_put_failure; out: nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } void inet_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv4_devconf *devconf) { struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_KERNEL); if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, event, 0, type); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err); } static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_RP_FILTER] = { .len = sizeof(int) }, [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] 
= { .len = sizeof(int) }, }; static int inet_netconf_valid_get_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_ipv4_policy, extack); err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_ipv4_policy, extack); if (err) return err; for (i = 0; i <= NETCONFA_MAX; i++) { if (!tb[i]) continue; switch (i) { case NETCONFA_IFINDEX: break; default: NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in netconf get request"); return -EINVAL; } } return 0; } static int inet_netconf_get_devconf(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX + 1]; const struct ipv4_devconf *devconf; struct in_device *in_dev = NULL; struct net_device *dev = NULL; struct sk_buff *skb; int ifindex; int err; err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack); if (err) return err; if (!tb[NETCONFA_IFINDEX]) return -EINVAL; ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); switch (ifindex) { case NETCONFA_IFINDEX_ALL: devconf = net->ipv4.devconf_all; break; case NETCONFA_IFINDEX_DEFAULT: devconf = net->ipv4.devconf_dflt; break; default: err = -ENODEV; dev = dev_get_by_index(net, ifindex); if (dev) in_dev = in_dev_get(dev); if (!in_dev) goto errout; devconf = &in_dev->cnf; break; } err = -ENOBUFS; skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, 0, NETCONFA_ALL); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: if (in_dev) in_dev_put(in_dev); dev_put(dev); return err; } static int inet_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct { unsigned long ifindex; unsigned int all_default; } *ctx = (void *)cb->ctx; const struct in_device *in_dev; struct net_device *dev; int err = 0; if (cb->strict_check) { struct netlink_ext_ack *extack = cb->extack; struct netconfmsg *ncm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ncm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid data after header in netconf dump request"); return -EINVAL; } } rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->ifindex) { in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; err = inet_netconf_fill_devconf(skb, dev->ifindex, &in_dev->cnf, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; } if (ctx->all_default == 0) { err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; ctx->all_default++; } if (ctx->all_default == 1) { err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt, NETLINK_CB(cb->skb).portid, 
nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; ctx->all_default++; } done: rcu_read_unlock(); return err; } #ifdef CONFIG_SYSCTL static void devinet_copy_dflt_conf(struct net *net, int i) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(net, dev) { struct in_device *in_dev; in_dev = __in_dev_get_rcu(dev); if (in_dev && !test_bit(i, in_dev->cnf.state)) in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i]; } rcu_read_unlock(); } /* called with RTNL locked */ static void inet_forward_change(struct net *net) { struct net_device *dev; int on = IPV4_DEVCONF_ALL(net, FORWARDING); IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; IPV4_DEVCONF_DFLT(net, FORWARDING) = on; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); for_each_netdev(net, dev) { struct in_device *in_dev; if (on) dev_disable_lro(dev); in_dev = __in_dev_get_rtnl_net(dev); if (in_dev) { IN_DEV_CONF_SET(in_dev, FORWARDING, on); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, dev->ifindex, &in_dev->cnf); } } } static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) { if (cnf == net->ipv4.devconf_dflt) return NETCONFA_IFINDEX_DEFAULT; else if (cnf == net->ipv4.devconf_all) return NETCONFA_IFINDEX_ALL; else { struct in_device *idev = container_of(cnf, struct in_device, cnf); return idev->dev->ifindex; } } static int devinet_conf_proc(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_value = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); int new_value = *(int *)ctl->data; if (write) { struct ipv4_devconf *cnf = ctl->extra1; struct net *net = ctl->extra2; int i = (int *)ctl->data - cnf->data; int ifindex; set_bit(i, cnf->state); if (cnf == net->ipv4.devconf_dflt) devinet_copy_dflt_conf(net, i); if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 || i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) if ((new_value == 0) && (old_value != 0)) rt_cache_flush(net); if (i == IPV4_DEVCONF_BC_FORWARDING - 1 && new_value != old_value) rt_cache_flush(net); if (i == IPV4_DEVCONF_RP_FILTER - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_RP_FILTER, ifindex, cnf); } if (i == IPV4_DEVCONF_PROXY_ARP - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_PROXY_NEIGH, ifindex, cnf); } if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, ifindex, cnf); } } return ret; } static int devinet_sysctl_forward(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; loff_t pos = *ppos; struct net *net = ctl->extra2; int ret; if (write && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write && *valp != val) { if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { if (!rtnl_net_trylock(net)) { /* Restore the original values before restarting */ *valp = val; *ppos = pos; return restart_syscall(); } if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { inet_forward_change(net); } else { struct ipv4_devconf *cnf = 
ctl->extra1; struct in_device *idev = container_of(cnf, struct in_device, cnf); if (*valp) dev_disable_lro(idev->dev); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, idev->dev->ifindex, cnf); } rtnl_net_unlock(net); rt_cache_flush(net); } else inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); } return ret; } static int ipv4_doint_and_flush(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); struct net *net = ctl->extra2; if (write && *valp != val) rt_cache_flush(net); return ret; } #define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \ { \ .procname = name, \ .data = ipv4_devconf.data + \ IPV4_DEVCONF_ ## attr - 1, \ .maxlen = sizeof(int), \ .mode = mval, \ .proc_handler = proc, \ .extra1 = &ipv4_devconf, \ } #define DEVINET_SYSCTL_RW_ENTRY(attr, name) \ DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc) #define DEVINET_SYSCTL_RO_ENTRY(attr, name) \ DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc) #define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \ DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc) #define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \ DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush) static struct devinet_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table devinet_vars[IPV4_DEVCONF_MAX]; } devinet_sysctl = { .devinet_vars = { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", devinet_sysctl_forward), DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"), DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"), DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"), DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE, "accept_source_route"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"), DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"), DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"), DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"), DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"), DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"), DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"), DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"), DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), DEVINET_SYSCTL_RW_ENTRY(ARP_EVICT_NOCARRIER, "arp_evict_nocarrier"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION, "force_igmp_version"), DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL, "igmpv2_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, "igmpv3_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, "ignore_routes_with_linkdown"), DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP, "drop_gratuitous_arp"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, "promote_secondaries"), DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, "route_localnet"), 
DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST, "drop_unicast_in_l2_multicast"), }, }; static int __devinet_sysctl_register(struct net *net, char *dev_name, int ifindex, struct ipv4_devconf *p) { int i; struct devinet_sysctl_table *t; char path[sizeof("net/ipv4/conf/") + IFNAMSIZ]; t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto out; for (i = 0; i < ARRAY_SIZE(t->devinet_vars); i++) { t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; t->devinet_vars[i].extra1 = p; t->devinet_vars[i].extra2 = net; } snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name); t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars); if (!t->sysctl_header) goto free; p->sysctl = t; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL, ifindex, p); return 0; free: kfree(t); out: return -ENOMEM; } static void __devinet_sysctl_unregister(struct net *net, struct ipv4_devconf *cnf, int ifindex) { struct devinet_sysctl_table *t = cnf->sysctl; if (t) { cnf->sysctl = NULL; unregister_net_sysctl_table(t->sysctl_header); kfree(t); } inet_netconf_notify_devconf(net, RTM_DELNETCONF, 0, ifindex, NULL); } static int devinet_sysctl_register(struct in_device *idev) { int err; if (!sysctl_dev_name_is_allowed(idev->dev->name)) return -EINVAL; err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL); if (err) return err; err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, idev->dev->ifindex, &idev->cnf); if (err) neigh_sysctl_unregister(idev->arp_parms); return err; } static void devinet_sysctl_unregister(struct in_device *idev) { struct net *net = dev_net(idev->dev); __devinet_sysctl_unregister(net, &idev->cnf, idev->dev->ifindex); neigh_sysctl_unregister(idev->arp_parms); } static struct ctl_table ctl_forward_entry[] = { { .procname = "ip_forward", .data = &ipv4_devconf.data[ IPV4_DEVCONF_FORWARDING - 1], .maxlen = sizeof(int), .mode = 0644, .proc_handler = devinet_sysctl_forward, .extra1 = &ipv4_devconf, .extra2 = &init_net, }, }; #endif static __net_init int devinet_init_net(struct net *net) { #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table *tbl; #endif struct ipv4_devconf *all, *dflt; int err; int i; err = -ENOMEM; net->ipv4.inet_addr_lst = kmalloc_array(IN4_ADDR_HSIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!net->ipv4.inet_addr_lst) goto err_alloc_hash; all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL); if (!all) goto err_alloc_all; dflt = kmemdup(&ipv4_devconf_dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); if (!dflt) goto err_alloc_dflt; #ifdef CONFIG_SYSCTL tbl = kmemdup(ctl_forward_entry, sizeof(ctl_forward_entry), GFP_KERNEL); if (!tbl) goto err_alloc_ctl; tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; tbl[0].extra1 = all; tbl[0].extra2 = net; #endif if (!net_eq(net, &init_net)) { switch (net_inherit_devconf()) { case 3: /* copy from the current netns */ memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, current->nsproxy->net_ns->ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); break; case 0: case 1: /* copy from init_net */ memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); break; case 2: /* use compiled values */ break; } } #ifdef CONFIG_SYSCTL err = __devinet_sysctl_register(net, "all", NETCONFA_IFINDEX_ALL, all); if (err < 0) goto err_reg_all; err = __devinet_sysctl_register(net, "default", NETCONFA_IFINDEX_DEFAULT, dflt); if (err < 0) goto err_reg_dflt; err 
= -ENOMEM; forw_hdr = register_net_sysctl_sz(net, "net/ipv4", tbl, ARRAY_SIZE(ctl_forward_entry)); if (!forw_hdr) goto err_reg_ctl; net->ipv4.forw_hdr = forw_hdr; #endif for (i = 0; i < IN4_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&net->ipv4.inet_addr_lst[i]); INIT_DEFERRABLE_WORK(&net->ipv4.addr_chk_work, check_lifetime); net->ipv4.devconf_all = all; net->ipv4.devconf_dflt = dflt; return 0; #ifdef CONFIG_SYSCTL err_reg_ctl: __devinet_sysctl_unregister(net, dflt, NETCONFA_IFINDEX_DEFAULT); err_reg_dflt: __devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL); err_reg_all: kfree(tbl); err_alloc_ctl: #endif kfree(dflt); err_alloc_dflt: kfree(all); err_alloc_all: kfree(net->ipv4.inet_addr_lst); err_alloc_hash: return err; } static __net_exit void devinet_exit_net(struct net *net) { #ifdef CONFIG_SYSCTL const struct ctl_table *tbl; #endif cancel_delayed_work_sync(&net->ipv4.addr_chk_work); #ifdef CONFIG_SYSCTL tbl = net->ipv4.forw_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.forw_hdr); __devinet_sysctl_unregister(net, net->ipv4.devconf_dflt, NETCONFA_IFINDEX_DEFAULT); __devinet_sysctl_unregister(net, net->ipv4.devconf_all, NETCONFA_IFINDEX_ALL); kfree(tbl); #endif kfree(net->ipv4.devconf_dflt); kfree(net->ipv4.devconf_all); kfree(net->ipv4.inet_addr_lst); } static __net_initdata struct pernet_operations devinet_ops = { .init = devinet_init_net, .exit = devinet_exit_net, }; static struct rtnl_af_ops inet_af_ops __read_mostly = { .family = AF_INET, .fill_link_af = inet_fill_link_af, .get_link_af_size = inet_get_link_af_size, .validate_link_af = inet_validate_link_af, .set_link_af = inet_set_link_af, }; static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_NEWADDR, .doit = inet_rtm_newaddr, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_DELADDR, .doit = inet_rtm_deladdr, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETADDR, .dumpit = inet_dump_ifaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, {.protocol = PF_INET, .msgtype = RTM_GETNETCONF, .doit = inet_netconf_get_devconf, .dumpit = inet_netconf_dump_devconf, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET, .msgtype = RTM_GETMULTICAST, .dumpit = inet_dump_ifmcaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED}, }; void __init devinet_init(void) { register_pernet_subsys(&devinet_ops); register_netdevice_notifier(&ip_netdev_notifier); if (rtnl_af_register(&inet_af_ops)) panic("Unable to register inet_af_ops\n"); rtnl_register_many(devinet_rtnl_msg_handlers); } |
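/*
 * Illustrative sketch (not kernel code): the devinet sysctl tables above lean
 * on two pointer-arithmetic idioms.  Each template entry's .data points into
 * the static ipv4_devconf array, and __devinet_sysctl_register() rebases it
 * onto a per-device copy by adding the byte offset between the copy and the
 * template; devinet_conf_proc() later recovers the parameter index by
 * subtracting the array base from ctl->data.  Everything below (struct and
 * entry names, the four parameters) is a simplified stand-in assumed only for
 * this example.
 */
#include <stdio.h>
#include <string.h>

#define NPARMS 4

struct devconf {
	int data[NPARMS];
};

struct entry {
	const char *name;
	void *data;
};

static struct devconf template_conf = { .data = { 1, 0, 0, 1 } };

int main(void)
{
	struct entry vars[NPARMS] = {
		{ "forwarding",       &template_conf.data[0] },
		{ "accept_redirects", &template_conf.data[1] },
		{ "rp_filter",        &template_conf.data[2] },
		{ "proxy_arp",        &template_conf.data[3] },
	};
	struct devconf per_dev;
	int i;

	memcpy(&per_dev, &template_conf, sizeof(per_dev));

	/*
	 * Rebase each .data pointer from the template onto the copy, as
	 * __devinet_sysctl_register() does with
	 * "(char *)p - (char *)&ipv4_devconf".
	 */
	for (i = 0; i < NPARMS; i++)
		vars[i].data = (char *)vars[i].data +
			       ((char *)&per_dev - (char *)&template_conf);

	/*
	 * Recover the parameter index from a .data pointer, as
	 * devinet_conf_proc() does with "(int *)ctl->data - cnf->data".
	 */
	for (i = 0; i < NPARMS; i++) {
		int idx = (int *)vars[i].data - per_dev.data;

		printf("%s -> index %d, value %d\n",
		       vars[i].name, idx, *(int *)vars[i].data);
	}
	return 0;
}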
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/uaccess.h>
#include <linux/kernel.h>

#include <asm/vsyscall.h>

#ifdef CONFIG_X86_64
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
{
	unsigned long vaddr = (unsigned long)unsafe_src;

	/*
	 * Do not allow userspace addresses.  This disallows
	 * normal userspace and the userspace guard page:
	 */
	if (vaddr < TASK_SIZE_MAX + PAGE_SIZE)
		return false;

	/*
	 * Reading from the vsyscall page may cause an unhandled fault in
	 * certain cases.  Though it is at an address above TASK_SIZE_MAX, it is
	 * usually considered as a user space address.
	 */
	if (is_vsyscall_vaddr(vaddr))
		return false;

	/*
	 * Allow everything during early boot before 'x86_virt_bits'
	 * is initialized.  Needed for instruction decoding in early
	 * exception handlers.
	 */
	if (!boot_cpu_data.x86_virt_bits)
		return true;

	return __is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits);
}
#else
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
{
	return (unsigned long)unsafe_src >= TASK_SIZE_MAX;
}
#endif
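/*
 * Illustrative sketch (not kernel code): what the canonical-address test
 * above amounts to.  On x86-64 with, say, 48 virtual-address bits, a
 * canonical address must have bits 63..47 equal to bit 47, i.e. the value
 * must survive sign-extension from bit (virt_bits - 1).  The helper below is
 * a stand-in for the kernel's __is_canonical_address(), assumed only for
 * this example, and relies on the usual two's-complement arithmetic-shift
 * behaviour of mainstream compilers.
 */
#include <stdint.h>
#include <stdio.h>

static int is_canonical(uint64_t vaddr, unsigned int virt_bits)
{
	/* Sign-extend from bit (virt_bits - 1) and compare with the input. */
	int64_t extended = (int64_t)(vaddr << (64 - virt_bits)) >>
			   (64 - virt_bits);

	return (uint64_t)extended == vaddr;
}

int main(void)
{
	printf("%d\n", is_canonical(0x00007fffffffe000ULL, 48)); /* 1: user range */
	printf("%d\n", is_canonical(0xffff800000000000ULL, 48)); /* 1: kernel range */
	printf("%d\n", is_canonical(0x0000800000000000ULL, 48)); /* 0: non-canonical hole */
	return 0;
}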
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM icmp

#if !defined(_TRACE_ICMP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_ICMP_H

#include <linux/icmp.h>
#include <linux/tracepoint.h>

TRACE_EVENT(icmp_send,

	TP_PROTO(const struct sk_buff *skb, int type, int code),

	TP_ARGS(skb, type, code),

	TP_STRUCT__entry(
		__field(const void *, skbaddr)
		__field(int, type)
		__field(int, code)
		__array(__u8, saddr, 4)
		__array(__u8, daddr, 4)
		__field(__u16, sport)
		__field(__u16, dport)
		__field(unsigned short, ulen)
	),

	TP_fast_assign(
		struct iphdr *iph = ip_hdr(skb);
		struct udphdr *uh = udp_hdr(skb);
		int proto_4 = iph->protocol;
		__be32 *p32;

		__entry->skbaddr = skb;
		__entry->type = type;
		__entry->code = code;

		if (proto_4 != IPPROTO_UDP || (u8 *)uh < skb->head ||
		    (u8 *)uh + sizeof(struct udphdr) > skb_tail_pointer(skb)) {
			__entry->sport = 0;
			__entry->dport = 0;
			__entry->ulen = 0;
		} else {
			__entry->sport = ntohs(uh->source);
			__entry->dport = ntohs(uh->dest);
			__entry->ulen = ntohs(uh->len);
		}

		p32 = (__be32 *) __entry->saddr;
		*p32 = iph->saddr;

		p32 = (__be32 *) __entry->daddr;
		*p32 = iph->daddr;
	),

	TP_printk("icmp_send: type=%d, code=%d. From %pI4:%u to %pI4:%u ulen=%d skbaddr=%p",
		  __entry->type, __entry->code,
		  __entry->saddr, __entry->sport, __entry->daddr,
		  __entry->dport, __entry->ulen, __entry->skbaddr)
);

#endif /* _TRACE_ICMP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
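/*
 * Illustrative sketch (not kernel code): the tracepoint above stores each
 * IPv4 address by writing the raw network-byte-order 32-bit value into a
 * 4-byte array, which the "%pI4" printk format then renders as a dotted
 * quad.  The plain-C equivalent of that round trip is shown below; the
 * variable names are stand-ins assumed only for this example.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint32_t saddr = htonl(0x7f000001);	/* 127.0.0.1 in network byte order */
	uint8_t bytes[4];

	/* Same idea as "p32 = (__be32 *)__entry->saddr; *p32 = iph->saddr;" */
	memcpy(bytes, &saddr, sizeof(bytes));

	/* %pI4 prints the bytes in array order, i.e. wire order. */
	printf("%u.%u.%u.%u\n", bytes[0], bytes[1], bytes[2], bytes[3]);
	return 0;
}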
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2019 Google LLC
 */

/*
 * Refer to Documentation/block/inline-encryption.rst for detailed explanation.
*/ #define pr_fmt(fmt) "blk-crypto: " fmt #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-crypto-profile.h> #include <linux/module.h> #include <linux/ratelimit.h> #include <linux/slab.h> #include "blk-crypto-internal.h" const struct blk_crypto_mode blk_crypto_modes[] = { [BLK_ENCRYPTION_MODE_AES_256_XTS] = { .name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, .security_strength = 32, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = { .name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, .security_strength = 16, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_ADIANTUM] = { .name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, .security_strength = 32, .ivsize = 32, }, [BLK_ENCRYPTION_MODE_SM4_XTS] = { .name = "SM4-XTS", .cipher_str = "xts(sm4)", .keysize = 32, .security_strength = 16, .ivsize = 16, }, }; /* * This number needs to be at least (the number of threads doing IO * concurrently) * (maximum recursive depth of a bio), so that we don't * deadlock on crypt_ctx allocations. The default is chosen to be the same * as the default number of post read contexts in both EXT4 and F2FS. */ static int num_prealloc_crypt_ctxs = 128; module_param(num_prealloc_crypt_ctxs, int, 0444); MODULE_PARM_DESC(num_prealloc_crypt_ctxs, "Number of bio crypto contexts to preallocate"); static struct kmem_cache *bio_crypt_ctx_cache; static mempool_t *bio_crypt_ctx_pool; static int __init bio_crypt_ctx_init(void) { size_t i; bio_crypt_ctx_cache = KMEM_CACHE(bio_crypt_ctx, 0); if (!bio_crypt_ctx_cache) goto out_no_mem; bio_crypt_ctx_pool = mempool_create_slab_pool(num_prealloc_crypt_ctxs, bio_crypt_ctx_cache); if (!bio_crypt_ctx_pool) goto out_no_mem; /* This is assumed in various places. */ BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0); /* * Validate the crypto mode properties. This ideally would be done with * static assertions, but boot-time checks are the next best thing. */ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) { BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_RAW_KEY_SIZE); BUG_ON(blk_crypto_modes[i].security_strength > blk_crypto_modes[i].keysize); BUG_ON(blk_crypto_modes[i].ivsize > BLK_CRYPTO_MAX_IV_SIZE); } return 0; out_no_mem: panic("Failed to allocate mem for bio crypt ctxs\n"); } subsys_initcall(bio_crypt_ctx_init); void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask) { struct bio_crypt_ctx *bc; /* * The caller must use a gfp_mask that contains __GFP_DIRECT_RECLAIM so * that the mempool_alloc() can't fail. */ WARN_ON_ONCE(!(gfp_mask & __GFP_DIRECT_RECLAIM)); bc = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); bc->bc_key = key; memcpy(bc->bc_dun, dun, sizeof(bc->bc_dun)); bio->bi_crypt_context = bc; } void __bio_crypt_free_ctx(struct bio *bio) { mempool_free(bio->bi_crypt_context, bio_crypt_ctx_pool); bio->bi_crypt_context = NULL; } int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask) { dst->bi_crypt_context = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); if (!dst->bi_crypt_context) return -ENOMEM; *dst->bi_crypt_context = *src->bi_crypt_context; return 0; } /* Increments @dun by @inc, treating @dun as a multi-limb integer. */ void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc) { int i; for (i = 0; inc && i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) { dun[i] += inc; /* * If the addition in this limb overflowed, then we need to * carry 1 into the next limb. Else the carry is 0. 
*/ if (dun[i] < inc) inc = 1; else inc = 0; } } void __bio_crypt_advance(struct bio *bio, unsigned int bytes) { struct bio_crypt_ctx *bc = bio->bi_crypt_context; bio_crypt_dun_increment(bc->bc_dun, bytes >> bc->bc_key->data_unit_size_bits); } /* * Returns true if @bc->bc_dun plus @bytes converted to data units is equal to * @next_dun, treating the DUNs as multi-limb integers. */ bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, unsigned int bytes, const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]) { int i; unsigned int carry = bytes >> bc->bc_key->data_unit_size_bits; for (i = 0; i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) { if (bc->bc_dun[i] + carry != next_dun[i]) return false; /* * If the addition in this limb overflowed, then we need to * carry 1 into the next limb. Else the carry is 0. */ if ((bc->bc_dun[i] + carry) < carry) carry = 1; else carry = 0; } /* If the DUN wrapped through 0, don't treat it as contiguous. */ return carry == 0; } /* * Checks that two bio crypt contexts are compatible - i.e. that * they are mergeable except for data_unit_num continuity. */ static bool bio_crypt_ctx_compatible(struct bio_crypt_ctx *bc1, struct bio_crypt_ctx *bc2) { if (!bc1) return !bc2; return bc2 && bc1->bc_key == bc2->bc_key; } bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) { return bio_crypt_ctx_compatible(rq->crypt_ctx, bio->bi_crypt_context); } /* * Checks that two bio crypt contexts are compatible, and also * that their data_unit_nums are continuous (and can hence be merged) * in the order @bc1 followed by @bc2. */ bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, struct bio_crypt_ctx *bc2) { if (!bio_crypt_ctx_compatible(bc1, bc2)) return false; return !bc1 || bio_crypt_dun_is_contiguous(bc1, bc1_bytes, bc2->bc_dun); } /* Check that all I/O segments are data unit aligned. */ static bool bio_crypt_check_alignment(struct bio *bio) { const unsigned int data_unit_size = bio->bi_crypt_context->bc_key->crypto_cfg.data_unit_size; struct bvec_iter iter; struct bio_vec bv; bio_for_each_segment(bv, bio, iter) { if (!IS_ALIGNED(bv.bv_len | bv.bv_offset, data_unit_size)) return false; } return true; } blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq) { return blk_crypto_get_keyslot(rq->q->crypto_profile, rq->crypt_ctx->bc_key, &rq->crypt_keyslot); } void __blk_crypto_rq_put_keyslot(struct request *rq) { blk_crypto_put_keyslot(rq->crypt_keyslot); rq->crypt_keyslot = NULL; } void __blk_crypto_free_request(struct request *rq) { /* The keyslot, if one was needed, should have been released earlier. */ if (WARN_ON_ONCE(rq->crypt_keyslot)) __blk_crypto_rq_put_keyslot(rq); mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool); rq->crypt_ctx = NULL; } /** * __blk_crypto_bio_prep - Prepare bio for inline encryption * * @bio_ptr: pointer to original bio pointer * * If the bio crypt context provided for the bio is supported by the underlying * device's inline encryption hardware, do nothing. * * Otherwise, try to perform en/decryption for this bio by falling back to the * kernel crypto API. When the crypto API fallback is used for encryption, * blk-crypto may choose to split the bio into 2 - the first one that will * continue to be processed and the second one that will be resubmitted via * submit_bio_noacct. A bounce bio will be allocated to encrypt the contents * of the aforementioned "first one", and *bio_ptr will be updated to this * bounce bio. * * Caller must ensure bio has bio_crypt_ctx. 
* * Return: true on success; false on error (and bio->bi_status will be set * appropriately, and bio_endio() will have been called so bio * submission should abort). */ bool __blk_crypto_bio_prep(struct bio **bio_ptr) { struct bio *bio = *bio_ptr; const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; /* Error if bio has no data. */ if (WARN_ON_ONCE(!bio_has_data(bio))) { bio->bi_status = BLK_STS_IOERR; goto fail; } if (!bio_crypt_check_alignment(bio)) { bio->bi_status = BLK_STS_INVAL; goto fail; } /* * Success if device supports the encryption context, or if we succeeded * in falling back to the crypto API. */ if (blk_crypto_config_supported_natively(bio->bi_bdev, &bc_key->crypto_cfg)) return true; if (blk_crypto_fallback_bio_prep(bio_ptr)) return true; fail: bio_endio(*bio_ptr); return false; } int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask) { if (!rq->crypt_ctx) { rq->crypt_ctx = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); if (!rq->crypt_ctx) return -ENOMEM; } *rq->crypt_ctx = *bio->bi_crypt_context; return 0; } /** * blk_crypto_init_key() - Prepare a key for use with blk-crypto * @blk_key: Pointer to the blk_crypto_key to initialize. * @key_bytes: the bytes of the key * @key_size: size of the key in bytes * @key_type: type of the key -- either raw or hardware-wrapped * @crypto_mode: identifier for the encryption algorithm to use * @dun_bytes: number of bytes that will be used to specify the DUN when this * key is used * @data_unit_size: the data unit size to use for en/decryption * * Return: 0 on success, -errno on failure. The caller is responsible for * zeroizing both blk_key and key_bytes when done with them. */ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *key_bytes, size_t key_size, enum blk_crypto_key_type key_type, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size) { const struct blk_crypto_mode *mode; memset(blk_key, 0, sizeof(*blk_key)); if (crypto_mode >= ARRAY_SIZE(blk_crypto_modes)) return -EINVAL; mode = &blk_crypto_modes[crypto_mode]; switch (key_type) { case BLK_CRYPTO_KEY_TYPE_RAW: if (key_size != mode->keysize) return -EINVAL; break; case BLK_CRYPTO_KEY_TYPE_HW_WRAPPED: if (key_size < mode->security_strength || key_size > BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE) return -EINVAL; break; default: return -EINVAL; } if (dun_bytes == 0 || dun_bytes > mode->ivsize) return -EINVAL; if (!is_power_of_2(data_unit_size)) return -EINVAL; blk_key->crypto_cfg.crypto_mode = crypto_mode; blk_key->crypto_cfg.dun_bytes = dun_bytes; blk_key->crypto_cfg.data_unit_size = data_unit_size; blk_key->crypto_cfg.key_type = key_type; blk_key->data_unit_size_bits = ilog2(data_unit_size); blk_key->size = key_size; memcpy(blk_key->bytes, key_bytes, key_size); return 0; } bool blk_crypto_config_supported_natively(struct block_device *bdev, const struct blk_crypto_config *cfg) { return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, cfg); } /* * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the * block_device it's submitted to supports inline crypto, or the * blk-crypto-fallback is enabled and supports the cfg). 
*/ bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg) { if (IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) && cfg->key_type == BLK_CRYPTO_KEY_TYPE_RAW) return true; return blk_crypto_config_supported_natively(bdev, cfg); } /** * blk_crypto_start_using_key() - Start using a blk_crypto_key on a device * @bdev: block device to operate on * @key: A key to use on the device * * Upper layers must call this function to ensure that either the hardware * supports the key's crypto settings, or the crypto API fallback has transforms * for the needed mode allocated and ready to go. This function may allocate * an skcipher, and *should not* be called from the data path, since that might * cause a deadlock * * Return: 0 on success; -EOPNOTSUPP if the key is wrapped but the hardware does * not support wrapped keys; -ENOPKG if the key is a raw key but the * hardware does not support raw keys and blk-crypto-fallback is either * disabled or the needed algorithm is disabled in the crypto API; or * another -errno code if something else went wrong. */ int blk_crypto_start_using_key(struct block_device *bdev, const struct blk_crypto_key *key) { if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) return 0; if (key->crypto_cfg.key_type != BLK_CRYPTO_KEY_TYPE_RAW) { pr_warn_ratelimited("%pg: no support for wrapped keys\n", bdev); return -EOPNOTSUPP; } return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } /** * blk_crypto_evict_key() - Evict a blk_crypto_key from a block_device * @bdev: a block_device on which I/O using the key may have been done * @key: the key to evict * * For a given block_device, this function removes the given blk_crypto_key from * the keyslot management structures and evicts it from any underlying hardware * keyslot(s) or blk-crypto-fallback keyslot it may have been programmed into. * * Upper layers must call this before freeing the blk_crypto_key. It must be * called for every block_device the key may have been used on. The key must no * longer be in use by any I/O when this function is called. * * Context: May sleep. */ void blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key) { struct request_queue *q = bdev_get_queue(bdev); int err; if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) err = __blk_crypto_evict_key(q->crypto_profile, key); else err = blk_crypto_fallback_evict_key(key); /* * An error can only occur here if the key failed to be evicted from a * keyslot (due to a hardware or driver issue) or is allegedly still in * use by I/O (due to a kernel bug). Even in these cases, the key is * still unlinked from the keyslot management structures, and the caller * is allowed and expected to free it right away. There's nothing * callers can do to handle errors, so just log them and return void. 
*/ if (err) pr_warn_ratelimited("%pg: error %d evicting key\n", bdev, err); } EXPORT_SYMBOL_GPL(blk_crypto_evict_key); static int blk_crypto_ioctl_import_key(struct blk_crypto_profile *profile, void __user *argp) { struct blk_crypto_import_key_arg arg; u8 raw_key[BLK_CRYPTO_MAX_RAW_KEY_SIZE]; u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; int ret; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) return -EINVAL; if (arg.raw_key_size < 16 || arg.raw_key_size > sizeof(raw_key)) return -EINVAL; if (copy_from_user(raw_key, u64_to_user_ptr(arg.raw_key_ptr), arg.raw_key_size)) { ret = -EFAULT; goto out; } ret = blk_crypto_import_key(profile, raw_key, arg.raw_key_size, lt_key); if (ret < 0) goto out; if (ret > arg.lt_key_size) { ret = -EOVERFLOW; goto out; } arg.lt_key_size = ret; if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key, arg.lt_key_size) || copy_to_user(argp, &arg, sizeof(arg))) { ret = -EFAULT; goto out; } ret = 0; out: memzero_explicit(raw_key, sizeof(raw_key)); memzero_explicit(lt_key, sizeof(lt_key)); return ret; } static int blk_crypto_ioctl_generate_key(struct blk_crypto_profile *profile, void __user *argp) { struct blk_crypto_generate_key_arg arg; u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; int ret; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) return -EINVAL; ret = blk_crypto_generate_key(profile, lt_key); if (ret < 0) goto out; if (ret > arg.lt_key_size) { ret = -EOVERFLOW; goto out; } arg.lt_key_size = ret; if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key, arg.lt_key_size) || copy_to_user(argp, &arg, sizeof(arg))) { ret = -EFAULT; goto out; } ret = 0; out: memzero_explicit(lt_key, sizeof(lt_key)); return ret; } static int blk_crypto_ioctl_prepare_key(struct blk_crypto_profile *profile, void __user *argp) { struct blk_crypto_prepare_key_arg arg; u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; int ret; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) return -EINVAL; if (arg.lt_key_size > sizeof(lt_key)) return -EINVAL; if (copy_from_user(lt_key, u64_to_user_ptr(arg.lt_key_ptr), arg.lt_key_size)) { ret = -EFAULT; goto out; } ret = blk_crypto_prepare_key(profile, lt_key, arg.lt_key_size, eph_key); if (ret < 0) goto out; if (ret > arg.eph_key_size) { ret = -EOVERFLOW; goto out; } arg.eph_key_size = ret; if (copy_to_user(u64_to_user_ptr(arg.eph_key_ptr), eph_key, arg.eph_key_size) || copy_to_user(argp, &arg, sizeof(arg))) { ret = -EFAULT; goto out; } ret = 0; out: memzero_explicit(lt_key, sizeof(lt_key)); memzero_explicit(eph_key, sizeof(eph_key)); return ret; } int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, void __user *argp) { struct blk_crypto_profile *profile = bdev_get_queue(bdev)->crypto_profile; if (!profile) return -EOPNOTSUPP; switch (cmd) { case BLKCRYPTOIMPORTKEY: return blk_crypto_ioctl_import_key(profile, argp); case BLKCRYPTOGENERATEKEY: return blk_crypto_ioctl_generate_key(profile, argp); case BLKCRYPTOPREPAREKEY: return blk_crypto_ioctl_prepare_key(profile, argp); default: return -ENOTTY; } } |
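/*
 * Illustrative sketch (not kernel code): the DUN handling above treats the
 * data unit number as a little-endian multi-limb integer of u64 limbs.
 * bio_crypt_dun_increment() adds a small increment with carry propagation,
 * and bio_crypt_dun_is_contiguous() checks that one DUN plus a byte count
 * (converted to data units) lands exactly on the next DUN.  A standalone
 * version of the increment-with-carry step, under those assumptions, with
 * DUN_LIMBS standing in for BLK_CRYPTO_DUN_ARRAY_SIZE:
 */
#include <stdint.h>
#include <stdio.h>

#define DUN_LIMBS 4

static void dun_increment(uint64_t dun[DUN_LIMBS], unsigned int inc)
{
	int i;

	for (i = 0; inc && i < DUN_LIMBS; i++) {
		dun[i] += inc;
		/*
		 * If this limb wrapped around (result smaller than the amount
		 * added), carry 1 into the next limb; otherwise stop.
		 */
		inc = (dun[i] < inc) ? 1 : 0;
	}
}

int main(void)
{
	uint64_t dun[DUN_LIMBS] = { UINT64_MAX, 0, 0, 0 };

	dun_increment(dun, 1);	/* low limb wraps to 0, carry into limb 1 */
	printf("%llu %llu\n",
	       (unsigned long long)dun[0], (unsigned long long)dun[1]);
	return 0;
}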
916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* md.h : kernel internal structure of the Linux MD driver Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman */ #ifndef _MD_MD_H #define _MD_MD_H #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/badblocks.h> #include <linux/kobject.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/mutex.h> #include <linux/timer.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <linux/raid/md_u.h> #include <trace/events/block.h> #define MaxSector (~(sector_t)0) enum md_submodule_type { MD_PERSONALITY = 0, MD_CLUSTER, MD_BITMAP, }; enum md_submodule_id { ID_LINEAR = LEVEL_LINEAR, ID_RAID0 = 0, ID_RAID1 = 1, ID_RAID4 = 4, ID_RAID5 = 5, ID_RAID6 = 6, ID_RAID10 = 10, ID_CLUSTER, ID_BITMAP, ID_LLBITMAP, ID_BITMAP_NONE, }; struct md_submodule_head { enum md_submodule_type type; enum md_submodule_id id; const char *name; struct module *owner; }; /* * These flags should really be called "NO_RETRY" rather than * "FAILFAST" because they don't make any promise about time lapse, * only about the number of retries, which will be zero. * REQ_FAILFAST_DRIVER is not included because * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.") * seems to suggest that the errors it avoids retrying should usually * be retried. */ #define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT) /* Status of sync thread. */ enum sync_action { /* * Represent by MD_RECOVERY_SYNC, start when: * 1) after assemble, sync data from first rdev to other copies, this * must be done first before other sync actions and will only execute * once; * 2) resize the array(notice that this is not reshape), sync data for * the new range; */ ACTION_RESYNC, /* * Represent by MD_RECOVERY_RECOVER, start when: * 1) for new replacement, sync data based on the replace rdev or * available copies from other rdev; * 2) for new member disk while the array is degraded, sync data from * other rdev; * 3) reassemble after power failure or re-add a hot removed rdev, sync * data from first rdev to other copies based on bitmap; */ ACTION_RECOVER, /* * Represent by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED | * MD_RECOVERY_CHECK, start when user echo "check" to sysfs api * sync_action, used to check if data copies from differenct rdev are * the same. 
The number of mismatch sectors will be exported to user * by sysfs api mismatch_cnt; */ ACTION_CHECK, /* * Represent by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED, start when * user echo "repair" to sysfs api sync_action, usually paired with * ACTION_CHECK, used to force syncing data once user found that there * are inconsistent data, */ ACTION_REPAIR, /* * Represent by MD_RECOVERY_RESHAPE, start when new member disk is added * to the conf, notice that this is different from spares or * replacement; */ ACTION_RESHAPE, /* * Represent by MD_RECOVERY_FROZEN, can be set by sysfs api sync_action * or internal usage like setting the array read-only, will forbid above * actions. */ ACTION_FROZEN, /* * All above actions don't match. */ ACTION_IDLE, NR_SYNC_ACTIONS, }; /* * The struct embedded in rdev is used to serialize IO. */ struct serial_in_rdev { struct rb_root_cached serial_rb; spinlock_t serial_lock; wait_queue_head_t serial_io_wait; }; /* * MD's 'extended' device */ struct md_rdev { struct list_head same_set; /* RAID devices within the same set */ sector_t sectors; /* Device size (in 512bytes sectors) */ struct mddev *mddev; /* RAID array if running */ unsigned long last_events; /* IO event timestamp */ /* * If meta_bdev is non-NULL, it means that a separate device is * being used to store the metadata (superblock/bitmap) which * would otherwise be contained on the same device as the data (bdev). */ struct block_device *meta_bdev; struct block_device *bdev; /* block device handle */ struct file *bdev_file; /* Handle from open for bdev */ struct page *sb_page, *bb_page; int sb_loaded; __u64 sb_events; sector_t data_offset; /* start of data in array */ sector_t new_data_offset;/* only relevant while reshaping */ sector_t sb_start; /* offset of the super block (in 512byte sectors) */ int sb_size; /* bytes in the superblock */ int preferred_minor; /* autorun support */ struct kobject kobj; /* A device can be in one of three states based on two flags: * Not working: faulty==1 in_sync==0 * Fully working: faulty==0 in_sync==1 * Working, but not * in sync with array * faulty==0 in_sync==0 * * It can never have faulty==1, in_sync==1 * This reduces the burden of testing multiple flags in many cases */ unsigned long flags; /* bit set of 'enum flag_bits' bits. */ wait_queue_head_t blocked_wait; int desc_nr; /* descriptor index in the superblock */ int raid_disk; /* role of device in array */ int new_raid_disk; /* role that the device will have in * the array after a level-change completes. */ int saved_raid_disk; /* role that device used to have in the * array and could again if we did a partial * resync from the bitmap */ union { sector_t recovery_offset;/* If this device has been partially * recovered, this is where we were * up to. */ sector_t journal_tail; /* If this device is a journal device, * this is the journal tail (journal * recovery start point) */ }; atomic_t nr_pending; /* number of pending requests. * only maintained for arrays that * support hot removal */ atomic_t read_errors; /* number of consecutive read errors that * we have tried to ignore. */ time64_t last_read_error; /* monotonic time since our * last read error */ atomic_t corrected_errors; /* number of corrected read errors, * for reporting to userspace and storing * in superblock. 
*/ struct serial_in_rdev *serial; /* used for raid1 io serialization */ struct kernfs_node *sysfs_state; /* handle for 'state' * sysfs entry */ /* handle for 'unacknowledged_bad_blocks' sysfs dentry */ struct kernfs_node *sysfs_unack_badblocks; /* handle for 'bad_blocks' sysfs dentry */ struct kernfs_node *sysfs_badblocks; struct badblocks badblocks; struct { short offset; /* Offset from superblock to start of PPL. * Not used by external metadata. */ unsigned int size; /* Size in sectors of the PPL space */ sector_t sector; /* First sector of the PPL space */ } ppl; }; enum flag_bits { Faulty, /* device is known to have a fault */ In_sync, /* device is in_sync with rest of array */ Bitmap_sync, /* ..actually, not quite In_sync. Need a * bitmap-based recovery to get fully in sync. * The bit is only meaningful before device * has been passed to pers->hot_add_disk. */ WriteMostly, /* Avoid reading if at all possible */ AutoDetected, /* added by auto-detect */ Blocked, /* An error occurred but has not yet * been acknowledged by the metadata * handler, so don't allow writes * until it is cleared */ WriteErrorSeen, /* A write error has been seen on this * device */ FaultRecorded, /* Intermediate state for clearing * Blocked. The Fault is/will-be * recorded in the metadata, but that * metadata hasn't been stored safely * on disk yet. */ BlockedBadBlocks, /* A writer is blocked because they * found an unacknowledged bad-block. * This can safely be cleared at any * time, and the writer will re-check. * It may be set at any time, and at * worst the writer will timeout and * re-check. So setting it as * accurately as possible is good, but * not absolutely critical. */ WantReplacement, /* This device is a candidate to be * hot-replaced, either because it has * reported some faults, or because * of explicit request. */ Replacement, /* This device is a replacement for * a want_replacement device with same * raid_disk number. */ Candidate, /* For clustered environments only: * This device is seen locally but not * by the whole cluster */ Journal, /* This device is used as journal for * raid-5/6. * Usually, this device should be faster * than other devices in the array */ ClusterRemove, ExternalBbl, /* External metadata provides bad * block management for a disk */ FailFast, /* Minimal retries should be attempted on * this device, so use REQ_FAILFAST_DEV. * Also don't try to repair failed reads. * It is expects that no bad block log * is present. */ LastDev, /* Seems to be the last working dev as * it didn't fail, so don't use FailFast * any more for metadata */ CollisionCheck, /* * check if there is collision between raid1 * serial bios. */ Nonrot, /* non-rotational device (SSD) */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, sector_t sectors, sector_t *first_bad, sector_t *bad_sectors) { if (unlikely(rdev->badblocks.count)) { int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s, sectors, first_bad, bad_sectors); if (rv) *first_bad -= rdev->data_offset; return rv; } return 0; } static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s, int sectors) { sector_t first_bad; sector_t bad_sectors; return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors); } extern bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); extern void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); struct md_cluster_info; struct md_cluster_operations; /** * enum mddev_flags - md device flags. 
* @MD_ARRAY_FIRST_USE: First use of array, needs initialization. * @MD_CLOSING: If set, we are closing the array, do not open it then. * @MD_JOURNAL_CLEAN: A raid with journal is already clean. * @MD_HAS_JOURNAL: The raid array has journal feature set. * @MD_CLUSTER_RESYNC_LOCKED: cluster raid only, which means node, already took * resync lock, need to release the lock. * @MD_FAILFAST_SUPPORTED: Using MD_FAILFAST on metadata writes is supported as * calls to md_error() will never cause the array to * become failed. * @MD_HAS_PPL: The raid array has PPL feature set. * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set. * @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that * array is ready yet. * @MD_BROKEN: This is used to stop writes and mark array as failed. * @MD_DELETED: This device is being deleted * * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */ enum mddev_flags { MD_ARRAY_FIRST_USE, MD_CLOSING, MD_JOURNAL_CLEAN, MD_HAS_JOURNAL, MD_CLUSTER_RESYNC_LOCKED, MD_FAILFAST_SUPPORTED, MD_HAS_PPL, MD_HAS_MULTIPLE_PPLS, MD_NOT_READY, MD_BROKEN, MD_DELETED, }; enum mddev_sb_flags { MD_SB_CHANGE_DEVS, /* Some device status has changed */ MD_SB_CHANGE_CLEAN, /* transition to or from 'clean' */ MD_SB_CHANGE_PENDING, /* switch from 'clean' to 'active' in progress */ MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ }; #define NR_SERIAL_INFOS 8 /* record current range of serialize IOs */ struct serial_info { struct rb_node node; sector_t start; /* start sector of rb node */ sector_t last; /* end sector of rb node */ sector_t _subtree_last; /* highest sector in subtree of rb node */ }; /* * mddev->curr_resync stores the current sector of the resync but * also has some overloaded values. */ enum { /* No resync in progress */ MD_RESYNC_NONE = 0, /* Yielded to allow another conflicting resync to commence */ MD_RESYNC_YIELDED = 1, /* Delayed to check that there is no conflict with another sync */ MD_RESYNC_DELAYED = 2, /* Any value greater than or equal to this is in an active resync */ MD_RESYNC_ACTIVE = 3, }; struct mddev { void *private; struct md_personality *pers; dev_t unit; int md_minor; struct list_head disks; unsigned long flags; unsigned long sb_flags; int suspended; struct mutex suspend_mutex; struct percpu_ref active_io; int ro; int sysfs_active; /* set when sysfs deletes * are happening, so run/ * takeover/stop are not safe */ struct gendisk *gendisk; /* mdraid gendisk */ struct gendisk *dm_gendisk; /* dm-raid gendisk */ struct kobject kobj; int hold_active; #define UNTIL_IOCTL 1 #define UNTIL_STOP 2 /* Superblock information */ int major_version, minor_version, patch_version; int persistent; int external; /* metadata is * managed externally */ char metadata_type[17]; /* externally set*/ int chunk_sectors; time64_t ctime, utime; int level, layout; char clevel[16]; int raid_disks; int max_disks; sector_t dev_sectors; /* used size of * component devices */ sector_t array_sectors; /* exported array size */ int external_size; /* size managed * externally */ __u64 events; /* If the last 'event' was simply a clean->dirty transition, and * we didn't write it to the spares, then it is safe and simple * to just decrement the event count on a dirty->clean transition. * So we record that possibility here. */ int can_decrease_events; char uuid[16]; /* If the array is being reshaped, we need to record the * new shape and an indication of where we are up to. * This is written to the superblock. 
* If reshape_position is MaxSector, then no reshape is happening (yet). */ sector_t reshape_position; int delta_disks, new_level, new_layout; int new_chunk_sectors; int reshape_backwards; struct md_thread __rcu *thread; /* management thread */ struct md_thread __rcu *sync_thread; /* doing resync or reconstruct */ /* * Set when a sync operation is started. It holds this value even * when the sync thread is "frozen" (interrupted) or "idle" (stopped * or finished). It is overwritten when a new sync operation is begun. */ enum sync_action last_sync_action; sector_t curr_resync; /* last block scheduled */ /* As resync requests can complete out of order, we cannot easily track * how much resync has been completed. So we occasionally pause until * everything completes, then set curr_resync_completed to curr_resync. * As such it may be well behind the real resync mark, but it is a value * we are certain of. */ sector_t curr_resync_completed; unsigned long resync_mark; /* a recent timestamp */ sector_t resync_mark_cnt;/* blocks written at resync_mark */ sector_t curr_mark_cnt; /* blocks scheduled now */ sector_t resync_max_sectors; /* may be set by personality */ atomic64_t resync_mismatches; /* count of sectors where * parity/replica mismatch found */ /* allow user-space to request suspension of IO to regions of the array */ sector_t suspend_lo; sector_t suspend_hi; /* if zero, use the system-wide default */ int sync_speed_min; int sync_speed_max; int sync_io_depth; /* resync even though the same disks are shared among md-devices */ int parallel_resync; int ok_start_degraded; unsigned long recovery; /* If a RAID personality determines that recovery (of a particular * device) will fail due to a read error on the source device, it * takes a copy of this number and does not attempt recovery again * until this number changes. */ int recovery_disabled; int in_sync; /* know to not need resync */ /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so * that we are never stopping an array while it is open. * 'reconfig_mutex' protects all other reconfiguration. * These locks are separate due to conflicting interactions * with disk->open_mutex. * Lock ordering is: * reconfig_mutex -> disk->open_mutex * disk->open_mutex -> open_mutex: e.g. __blkdev_get -> md_open */ struct mutex open_mutex; struct mutex reconfig_mutex; atomic_t active; /* general refcount */ atomic_t openers; /* number of active opens */ int changed; /* True if we might need to * reread partition info */ int degraded; /* whether md should consider * adding a spare */ unsigned long normal_io_events; /* IO event timestamp */ atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; sector_t resync_offset; sector_t resync_min; /* user requested sync * starts here */ sector_t resync_max; /* resync should pause * when it gets here */ struct kernfs_node *sysfs_state; /* handle for 'array_state' * file in sysfs. 
*/ struct kernfs_node *sysfs_action; /* handle for 'sync_action' */ struct kernfs_node *sysfs_completed; /*handle for 'sync_completed' */ struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */ struct kernfs_node *sysfs_level; /*handle for 'level' */ /* used for delayed sysfs removal */ struct work_struct del_work; /* used for register new sync thread */ struct work_struct sync_work; /* "lock" protects: * flush_bio transition from NULL to !NULL * rdev superblocks, events * clearing MD_CHANGE_* * in_sync - and related safemode and MD_CHANGE changes * pers (also protected by reconfig_mutex and pending IO). * clearing ->bitmap * clearing ->bitmap_info.file * changing ->resync_{min,max} * setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max}) */ spinlock_t lock; wait_queue_head_t sb_wait; /* for waiting on superblock updates */ atomic_t pending_writes; /* number of active superblock writes */ unsigned int safemode; /* if set, update "clean" superblock * when no writes pending. */ unsigned int safemode_delay; struct timer_list safemode_timer; struct percpu_ref writes_pending; int sync_checkers; /* # of threads checking writes_pending */ enum md_submodule_id bitmap_id; void *bitmap; /* the bitmap for the device */ struct bitmap_operations *bitmap_ops; struct { struct file *file; /* the bitmap file */ loff_t offset; /* offset from superblock of * start of bitmap. May be * negative, but not '0' * For external metadata, offset * from start of device. */ unsigned long space; /* space available at this offset */ loff_t default_offset; /* this is the offset to use when * hot-adding a bitmap. It should * eventually be settable by sysfs. */ unsigned long default_space; /* space available at * default offset */ struct mutex mutex; unsigned long chunksize; unsigned long daemon_sleep; /* how many jiffies between updates? */ unsigned long max_write_behind; /* write-behind mode */ int external; int nodes; /* Maximum number of nodes in the cluster */ char cluster_name[64]; /* Name of the cluster */ } bitmap_info; atomic_t max_corr_read_errors; /* max read retries */ struct list_head all_mddevs; const struct attribute_group *to_remove; struct bio_set bio_set; struct bio_set sync_set; /* for sync operations like * metadata and bitmap writes */ struct bio_set io_clone_set; struct work_struct event_work; /* used by dm to report failure event */ mempool_t *serial_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; struct md_cluster_operations *cluster_ops; unsigned int good_device_nr; /* good device num within cluster raid */ unsigned int noio_flag; /* for memalloc scope API */ /* * Temporarily store rdev that will be finally removed when * reconfig_mutex is unlocked, protected by reconfig_mutex. */ struct list_head deleting; /* The sequence number for sync thread */ atomic_t sync_seq; bool has_superblocks:1; bool fail_last_dev:1; bool serialize_policy:1; }; enum recovery_flags { /* flags for sync thread running status */ /* * set when one of sync action is set and new sync thread need to be * registered, or just add/remove spares from conf. 
*/ MD_RECOVERY_NEEDED, /* sync thread is running, or about to be started */ MD_RECOVERY_RUNNING, /* sync thread needs to be aborted for some reason */ MD_RECOVERY_INTR, /* sync thread is done and is waiting to be unregistered */ MD_RECOVERY_DONE, /* running sync thread must abort immediately, and not restart */ MD_RECOVERY_FROZEN, /* waiting for pers->start() to finish */ MD_RECOVERY_WAIT, /* interrupted because io-error */ MD_RECOVERY_ERROR, /* flags determines sync action, see details in enum sync_action */ /* if just this flag is set, action is resync. */ MD_RECOVERY_SYNC, /* * paired with MD_RECOVERY_SYNC, if MD_RECOVERY_CHECK is not set, * action is repair, means user requested resync. */ MD_RECOVERY_REQUESTED, /* * paired with MD_RECOVERY_SYNC and MD_RECOVERY_REQUESTED, action is * check. */ MD_RECOVERY_CHECK, /* recovery, or need to try it */ MD_RECOVERY_RECOVER, /* reshape */ MD_RECOVERY_RESHAPE, /* remote node is running resync thread */ MD_RESYNCING_REMOTE, /* raid456 lazy initial recover */ MD_RECOVERY_LAZY_RECOVER, }; enum md_ro_state { MD_RDWR, MD_RDONLY, MD_AUTO_READ, MD_MAX_STATE }; static inline bool md_is_rdwr(struct mddev *mddev) { return (mddev->ro == MD_RDWR); } static inline bool reshape_interrupted(struct mddev *mddev) { /* reshape never start */ if (mddev->reshape_position == MaxSector) return false; /* interrupted */ if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return true; /* running reshape will be interrupted soon. */ if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || test_bit(MD_RECOVERY_INTR, &mddev->recovery) || test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) return true; return false; } static inline int __must_check mddev_lock(struct mddev *mddev) { int ret; ret = mutex_lock_interruptible(&mddev->reconfig_mutex); /* MD_DELETED is set in do_md_stop with reconfig_mutex. * So check it here. */ if (!ret && test_bit(MD_DELETED, &mddev->flags)) { ret = -ENODEV; mutex_unlock(&mddev->reconfig_mutex); } return ret; } /* Sometimes we need to take the lock in a situation where * failure due to interrupts is not acceptable. * It doesn't need to check MD_DELETED here, the owner which * holds the lock here can't be stopped. And all paths can't * call this function after do_md_stop. */ static inline void mddev_lock_nointr(struct mddev *mddev) { mutex_lock(&mddev->reconfig_mutex); } static inline int mddev_trylock(struct mddev *mddev) { int ret; ret = mutex_trylock(&mddev->reconfig_mutex); if (!ret && test_bit(MD_DELETED, &mddev->flags)) { ret = -ENODEV; mutex_unlock(&mddev->reconfig_mutex); } return ret; } extern void mddev_unlock(struct mddev *mddev); struct md_personality { struct md_submodule_head head; bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio); /* * start up works that do NOT require md_thread. 
tasks that * requires md_thread should go into start() */ int (*run)(struct mddev *mddev); /* start up works that require md threads */ int (*start)(struct mddev *mddev); void (*free)(struct mddev *mddev, void *priv); void (*status)(struct seq_file *seq, struct mddev *mddev); /* error_handler must set ->faulty and clear ->in_sync * if appropriate, and should abort recovery if needed */ void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*spare_active) (struct mddev *mddev); sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, sector_t max_sector, int *skipped); int (*resize) (struct mddev *mddev, sector_t sectors); sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks); int (*check_reshape) (struct mddev *mddev); int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); void (*update_reshape_pos) (struct mddev *mddev); void (*prepare_suspend) (struct mddev *mddev); /* quiesce suspends or resumes internal processing. * 1 - stop new actions and wait for action io to complete * 0 - return to normal behaviour */ void (*quiesce) (struct mddev *mddev, int quiesce); /* takeover is used to transition an array from one * personality to another. The new personality must be able * to handle the data in the current layout. * e.g. 2drive raid1 -> 2drive raid5 * ndrive raid5 -> degraded n+1drive raid6 with special layout * If the takeover succeeds, a new 'private' structure is returned. * This needs to be installed and then ->run used to activate the * array. */ void *(*takeover) (struct mddev *mddev); /* Changes the consistency policy of an active array. */ int (*change_consistency_policy)(struct mddev *mddev, const char *buf); /* convert io ranges from array to bitmap */ void (*bitmap_sector)(struct mddev *mddev, sector_t *offset, unsigned long *sectors); }; struct md_sysfs_entry { struct attribute attr; ssize_t (*show)(struct mddev *, char *); ssize_t (*store)(struct mddev *, const char *, size_t); }; static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name) { if (sd) return sysfs_get_dirent(sd, name); return sd; } static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd) { if (sd) sysfs_notify_dirent(sd); } static inline char * mdname (struct mddev * mddev) { return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; } static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; if (!test_bit(Replacement, &rdev->flags) && !test_bit(Journal, &rdev->flags) && mddev->kobj.sd) { sprintf(nm, "rd%d", rdev->raid_disk); return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); } else return 0; } static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; if (!test_bit(Replacement, &rdev->flags) && !test_bit(Journal, &rdev->flags) && mddev->kobj.sd) { sprintf(nm, "rd%d", rdev->raid_disk); sysfs_remove_link(&mddev->kobj, nm); } } /* * iterates through some rdev ringlist. It's safe to remove the * current 'rdev'. Dont touch 'tmp' though. 
*/ #define rdev_for_each_list(rdev, tmp, head) \ list_for_each_entry_safe(rdev, tmp, head, same_set) /* * iterates through the 'same array disks' ringlist */ #define rdev_for_each(rdev, mddev) \ list_for_each_entry(rdev, &((mddev)->disks), same_set) #define rdev_for_each_safe(rdev, tmp, mddev) \ list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) #define rdev_for_each_rcu(rdev, mddev) \ list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) struct md_thread { void (*run) (struct md_thread *thread); struct mddev *mddev; wait_queue_head_t wqueue; unsigned long flags; struct task_struct *tsk; unsigned long timeout; void *private; }; struct md_io_clone { struct mddev *mddev; struct bio *orig_bio; unsigned long start_time; sector_t offset; unsigned long sectors; enum stat_group rw; struct bio bio_clone; }; #define THREAD_WAKEUP 0 static inline void safe_put_page(struct page *p) { if (p) put_page(p); } int register_md_submodule(struct md_submodule_head *msh); void unregister_md_submodule(struct md_submodule_head *msh); extern struct md_thread *md_register_thread( void (*run)(struct md_thread *thread), struct mddev *mddev, const char *name); extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp); extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); extern enum sync_action md_sync_action(struct mddev *mddev); extern enum sync_action md_sync_action_by_name(const char *page); extern const char *md_sync_action_name(enum sync_action action); extern void md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_inc(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio, sector_t start, sector_t size); void md_account_bio(struct mddev *mddev, struct bio **bio); void md_free_cloned_bio(struct bio *bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page, unsigned int offset); extern int md_super_wait(struct mddev *mddev); extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, blk_opf_t opf, bool metadata_op); extern void md_do_sync(struct md_thread *thread); extern void md_new_event(void); extern void md_allow_write(struct mddev *mddev); extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); extern int md_check_no_bitmap(struct mddev *mddev); extern int md_integrity_register(struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern int mddev_init(struct mddev *mddev); extern void mddev_destroy(struct mddev *mddev); void md_init_stacking_limits(struct queue_limits *lim); struct mddev *md_alloc(dev_t dev, char *name); void mddev_put(struct mddev *mddev); extern int md_run(struct mddev *mddev); extern int md_start(struct mddev *mddev); extern void md_stop(struct mddev *mddev); extern void md_stop_writes(struct mddev *mddev); extern int md_rdev_init(struct md_rdev *rdev); extern void md_rdev_clear(struct 
md_rdev *rdev); extern bool md_handle_request(struct mddev *mddev, struct bio *bio); extern int mddev_suspend(struct mddev *mddev, bool interruptible); extern void mddev_resume(struct mddev *mddev); extern void md_idle_sync_thread(struct mddev *mddev); extern void md_frozen_sync_thread(struct mddev *mddev); extern void md_unfrozen_sync_thread(struct mddev *mddev); extern void md_update_sb(struct mddev *mddev, int force); extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev); extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); static inline bool is_rdev_broken(struct md_rdev *rdev) { return !disk_live(rdev->bdev->bd_disk); } static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) { int faulty = test_bit(Faulty, &rdev->flags); if (atomic_dec_and_test(&rdev->nr_pending) && faulty) { set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } } static inline int mddev_is_clustered(struct mddev *mddev) { return mddev->cluster_info && mddev->bitmap_info.nodes > 1; } /* clear unsupported mddev_flags */ static inline void mddev_clear_unsupported_flags(struct mddev *mddev, unsigned long unsupported_flags) { mddev->flags &= ~unsupported_flags; } static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio) { if (bio_op(bio) == REQ_OP_WRITE_ZEROES && !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors) mddev->gendisk->queue->limits.max_write_zeroes_sectors = 0; } static inline int mddev_suspend_and_lock(struct mddev *mddev) { int ret; ret = mddev_suspend(mddev, true); if (ret) return ret; ret = mddev_lock(mddev); if (ret) mddev_resume(mddev); return ret; } static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev) { mddev_suspend(mddev, false); mutex_lock(&mddev->reconfig_mutex); } static inline void mddev_unlock_and_resume(struct mddev *mddev) { mddev_unlock(mddev); mddev_resume(mddev); } struct mdu_array_info_s; struct mdu_disk_info_s; extern int mdp_major; void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); int do_md_run(struct mddev *mddev); #define MDDEV_STACK_INTEGRITY (1u << 0) int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, unsigned int flags); int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev); void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes); extern const struct block_device_operations md_fops; /* * MD devices can be used undeneath by DM, in which case ->gendisk is NULL. */ static inline bool mddev_is_dm(struct mddev *mddev) { return !mddev->gendisk; } static inline bool raid_is_456(struct mddev *mddev) { return mddev->level == ID_RAID4 || mddev->level == ID_RAID5 || mddev->level == ID_RAID6; } static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio, sector_t sector) { if (!mddev_is_dm(mddev)) trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector); } static inline bool rdev_blocked(struct md_rdev *rdev) { /* * Blocked will be set by error handler and cleared by daemon after * updating superblock, meanwhile write IO should be blocked to prevent * reading old data after power failure. 
*/ if (test_bit(Blocked, &rdev->flags)) return true; /* * Faulty device should not be accessed anymore, there is no need to * wait for bad block to be acknowledged. */ if (test_bit(Faulty, &rdev->flags)) return false; /* rdev is blocked by badblocks. */ if (test_bit(BlockedBadBlocks, &rdev->flags)) return true; return false; } #define mddev_add_trace_msg(mddev, fmt, args...) \ do { \ if (!mddev_is_dm(mddev)) \ blk_add_trace_msg((mddev)->gendisk->queue, fmt, ##args); \ } while (0) #endif /* _MD_MD_H */ |
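/*
 * Illustrative sketch, not part of md.h: one way a caller might walk the
 * member devices of an array using the helpers declared above. The function
 * name and its "count faulty members" purpose are made up for illustration;
 * it assumes this header (and its dependencies) is included.
 */
static inline int example_count_faulty_rdevs(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int faulty = 0;
	int ret;

	/* May fail with -EINTR, or -ENODEV once MD_DELETED is set. */
	ret = mddev_lock(mddev);
	if (ret)
		return ret;

	/* rdev_for_each() iterates mddev->disks, see the macro above. */
	rdev_for_each(rdev, mddev)
		if (test_bit(Faulty, &rdev->flags))
			faulty++;

	mddev_unlock(mddev);
	return faulty;
}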
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2017 Red Hat, Inc
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/libps2.h>
#include <linux/i2c.h>
#include <linux/serio.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

#include "psmouse.h"

struct psmouse_smbus_dev {
	struct i2c_board_info board;
	struct psmouse *psmouse;
	struct i2c_client *client;
	struct list_head node;
	bool dead;
	bool need_deactivate;
};

static LIST_HEAD(psmouse_smbus_list);
static DEFINE_MUTEX(psmouse_smbus_mutex);
static struct workqueue_struct *psmouse_smbus_wq;

static void psmouse_smbus_check_adapter(struct i2c_adapter *adapter)
{
	struct psmouse_smbus_dev *smbdev;

	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_HOST_NOTIFY))
		return;

	guard(mutex)(&psmouse_smbus_mutex);

	list_for_each_entry(smbdev, &psmouse_smbus_list, node) {
		if (smbdev->dead)
			continue;

		if (smbdev->client)
			continue;

		/*
		 * Here would be a good place to check if device is actually
		 * present, but it seems that SMBus will not respond unless we
		 * fully reset PS/2 connection. So cross our fingers, and try
		 * to switch over, hopefully our system will not have too many
		 * "host notify" I2C adapters.
*/ psmouse_dbg(smbdev->psmouse, "SMBus candidate adapter appeared, triggering rescan\n"); serio_rescan(smbdev->psmouse->ps2dev.serio); } } static void psmouse_smbus_detach_i2c_client(struct i2c_client *client) { struct psmouse_smbus_dev *smbdev, *tmp; guard(mutex)(&psmouse_smbus_mutex); list_for_each_entry_safe(smbdev, tmp, &psmouse_smbus_list, node) { if (smbdev->client != client) continue; kfree(client->dev.platform_data); client->dev.platform_data = NULL; if (!smbdev->dead) { psmouse_dbg(smbdev->psmouse, "Marking SMBus companion %s as gone\n", dev_name(&smbdev->client->dev)); smbdev->dead = true; device_link_remove(&smbdev->client->dev, &smbdev->psmouse->ps2dev.serio->dev); serio_rescan(smbdev->psmouse->ps2dev.serio); } else { list_del(&smbdev->node); kfree(smbdev); } } } static int psmouse_smbus_notifier_call(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; switch (action) { case BUS_NOTIFY_ADD_DEVICE: if (dev->type == &i2c_adapter_type) psmouse_smbus_check_adapter(to_i2c_adapter(dev)); break; case BUS_NOTIFY_REMOVED_DEVICE: if (dev->type == &i2c_client_type) psmouse_smbus_detach_i2c_client(to_i2c_client(dev)); break; } return 0; } static struct notifier_block psmouse_smbus_notifier = { .notifier_call = psmouse_smbus_notifier_call, }; static psmouse_ret_t psmouse_smbus_process_byte(struct psmouse *psmouse) { return PSMOUSE_FULL_PACKET; } static int psmouse_smbus_reconnect(struct psmouse *psmouse) { struct psmouse_smbus_dev *smbdev = psmouse->private; if (smbdev->need_deactivate) psmouse_deactivate(psmouse); return 0; } struct psmouse_smbus_removal_work { struct work_struct work; struct i2c_client *client; }; static void psmouse_smbus_remove_i2c_device(struct work_struct *work) { struct psmouse_smbus_removal_work *rwork = container_of(work, struct psmouse_smbus_removal_work, work); dev_dbg(&rwork->client->dev, "destroying SMBus companion device\n"); i2c_unregister_device(rwork->client); kfree(rwork); } /* * This schedules removal of SMBus companion device. We have to do * it in a separate tread to avoid deadlocking on psmouse_mutex in * case the device has a trackstick (which is also driven by psmouse). * * Note that this may be racing with i2c adapter removal, but we * can't do anything about that: i2c automatically destroys clients * attached to an adapter that is being removed. This has to be * fixed in i2c core. 
*/ static void psmouse_smbus_schedule_remove(struct i2c_client *client) { struct psmouse_smbus_removal_work *rwork; rwork = kzalloc(sizeof(*rwork), GFP_KERNEL); if (rwork) { INIT_WORK(&rwork->work, psmouse_smbus_remove_i2c_device); rwork->client = client; queue_work(psmouse_smbus_wq, &rwork->work); } } static void psmouse_smbus_disconnect(struct psmouse *psmouse) { struct psmouse_smbus_dev *smbdev = psmouse->private; guard(mutex)(&psmouse_smbus_mutex); if (smbdev->dead) { list_del(&smbdev->node); kfree(smbdev); } else { smbdev->dead = true; device_link_remove(&smbdev->client->dev, &psmouse->ps2dev.serio->dev); psmouse_dbg(smbdev->psmouse, "posting removal request for SMBus companion %s\n", dev_name(&smbdev->client->dev)); psmouse_smbus_schedule_remove(smbdev->client); } psmouse->private = NULL; } static int psmouse_smbus_create_companion(struct device *dev, void *data) { struct psmouse_smbus_dev *smbdev = data; unsigned short addr_list[] = { smbdev->board.addr, I2C_CLIENT_END }; struct i2c_adapter *adapter; struct i2c_client *client; adapter = i2c_verify_adapter(dev); if (!adapter) return 0; if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_HOST_NOTIFY)) return 0; client = i2c_new_scanned_device(adapter, &smbdev->board, addr_list, NULL); if (IS_ERR(client)) return 0; /* We have our(?) device, stop iterating i2c bus. */ smbdev->client = client; return 1; } void psmouse_smbus_cleanup(struct psmouse *psmouse) { struct psmouse_smbus_dev *smbdev, *tmp; guard(mutex)(&psmouse_smbus_mutex); list_for_each_entry_safe(smbdev, tmp, &psmouse_smbus_list, node) { if (psmouse == smbdev->psmouse) { list_del(&smbdev->node); kfree(smbdev); } } } int psmouse_smbus_init(struct psmouse *psmouse, const struct i2c_board_info *board, const void *pdata, size_t pdata_size, bool need_deactivate, bool leave_breadcrumbs) { struct psmouse_smbus_dev *smbdev; int error; smbdev = kzalloc(sizeof(*smbdev), GFP_KERNEL); if (!smbdev) return -ENOMEM; smbdev->psmouse = psmouse; smbdev->board = *board; smbdev->need_deactivate = need_deactivate; if (pdata) { smbdev->board.platform_data = kmemdup(pdata, pdata_size, GFP_KERNEL); if (!smbdev->board.platform_data) { kfree(smbdev); return -ENOMEM; } } if (need_deactivate) psmouse_deactivate(psmouse); psmouse->private = smbdev; psmouse->protocol_handler = psmouse_smbus_process_byte; psmouse->reconnect = psmouse_smbus_reconnect; psmouse->fast_reconnect = psmouse_smbus_reconnect; psmouse->disconnect = psmouse_smbus_disconnect; psmouse->resync_time = 0; scoped_guard(mutex, &psmouse_smbus_mutex) { list_add_tail(&smbdev->node, &psmouse_smbus_list); } /* Bind to already existing adapters right away */ error = i2c_for_each_dev(smbdev, psmouse_smbus_create_companion); if (smbdev->client) { /* We have our companion device */ if (!device_link_add(&smbdev->client->dev, &psmouse->ps2dev.serio->dev, DL_FLAG_STATELESS)) psmouse_warn(psmouse, "failed to set up link with iSMBus companion %s\n", dev_name(&smbdev->client->dev)); return 0; } /* * If we did not create i2c device we will not need platform * data even if we are leaving breadcrumbs. */ kfree(smbdev->board.platform_data); smbdev->board.platform_data = NULL; if (error < 0 || !leave_breadcrumbs) { scoped_guard(mutex, &psmouse_smbus_mutex) { list_del(&smbdev->node); } kfree(smbdev); } return error < 0 ? 
error : -EAGAIN; } int __init psmouse_smbus_module_init(void) { int error; psmouse_smbus_wq = alloc_workqueue("psmouse-smbus", 0, 0); if (!psmouse_smbus_wq) return -ENOMEM; error = bus_register_notifier(&i2c_bus_type, &psmouse_smbus_notifier); if (error) { pr_err("failed to register i2c bus notifier: %d\n", error); destroy_workqueue(psmouse_smbus_wq); return error; } return 0; } void psmouse_smbus_module_exit(void) { bus_unregister_notifier(&i2c_bus_type, &psmouse_smbus_notifier); destroy_workqueue(psmouse_smbus_wq); } |
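/*
 * Illustrative sketch, not part of psmouse-smbus.c: how a PS/2 protocol
 * driver might hand a device over to its SMBus companion with
 * psmouse_smbus_init() as defined above. The "example-touchpad" I2C type and
 * the 0x2c address are hypothetical; real callers pass the board info of
 * their actual companion driver.
 */
static int example_try_smbus_companion(struct psmouse *psmouse)
{
	const struct i2c_board_info board = {
		I2C_BOARD_INFO("example-touchpad", 0x2c),
	};

	/*
	 * need_deactivate=true quiesces the PS/2 side before the I2C client
	 * attaches; leave_breadcrumbs=false drops the record again if no
	 * "Host Notify" adapter exists yet. Returns 0 once the companion is
	 * bound, -EAGAIN when no companion device was found, or a negative
	 * error code.
	 */
	return psmouse_smbus_init(psmouse, &board, NULL, 0, true, false);
}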
| 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 | // SPDX-License-Identifier: GPL-2.0 /* * drivers/power/process.c - Functions for starting/stopping processes on * suspend transitions. * * Originally from swsusp. */ #include <linux/interrupt.h> #include <linux/oom.h> #include <linux/suspend.h> #include <linux/module.h> #include <linux/sched/debug.h> #include <linux/sched/task.h> #include <linux/syscalls.h> #include <linux/freezer.h> #include <linux/delay.h> #include <linux/workqueue.h> #include <linux/kmod.h> #include <trace/events/power.h> #include <linux/cpuset.h> /* * Timeout for stopping processes */ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; static int try_to_freeze_tasks(bool user_only) { const char *what = user_only ? "user space processes" : "remaining freezable tasks"; struct task_struct *g, *p; unsigned long end_time; unsigned int todo; bool wq_busy = false; ktime_t start, end, elapsed; unsigned int elapsed_msecs; bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; pr_info("Freezing %s\n", what); start = ktime_get_boottime(); end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); if (!user_only) freeze_workqueues_begin(); while (true) { todo = 0; read_lock(&tasklist_lock); for_each_process_thread(g, p) { if (p == current || !freeze_task(p)) continue; todo++; } read_unlock(&tasklist_lock); if (!user_only) { wq_busy = freeze_workqueues_busy(); todo += wq_busy; } if (!todo || time_after(jiffies, end_time)) break; if (pm_wakeup_pending()) { wakeup = true; break; } /* * We need to retry, but first give the freezing tasks some * time to enter the refrigerator. Start with an initial * 1 ms sleep followed by exponential backoff until 8 ms. */ usleep_range(sleep_usecs / 2, sleep_usecs); if (sleep_usecs < 8 * USEC_PER_MSEC) sleep_usecs *= 2; } end = ktime_get_boottime(); elapsed = ktime_sub(end, start); elapsed_msecs = ktime_to_ms(elapsed); if (todo) { pr_err("Freezing %s %s after %d.%03d seconds " "(%d tasks refusing to freeze, wq_busy=%d):\n", what, wakeup ? "aborted" : "failed", elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); if (wq_busy) show_freezable_workqueues(); if (!wakeup || pm_debug_messages_on) { read_lock(&tasklist_lock); for_each_process_thread(g, p) { if (p != current && freezing(p) && !frozen(p)) sched_show_task(p); } read_unlock(&tasklist_lock); } } else { pr_info("Freezing %s completed (elapsed %d.%03d seconds)\n", what, elapsed_msecs / 1000, elapsed_msecs % 1000); } return todo ? -EBUSY : 0; } /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls * freeze_processes must later call thaw_processes. * * On success, returns 0. 
On failure, -errno and system is fully thawed. */ int freeze_processes(void) { int error; error = __usermodehelper_disable(UMH_FREEZING); if (error) return error; /* Make sure this task doesn't get frozen */ current->flags |= PF_SUSPEND_TASK; if (!pm_freezing) static_branch_inc(&freezer_active); pm_wakeup_clear(0); pm_freezing = true; error = try_to_freeze_tasks(true); if (!error) __usermodehelper_set_disable_depth(UMH_DISABLED); BUG_ON(in_atomic()); /* * Now that the whole userspace is frozen we need to disable * the OOM killer to disallow any further interference with * killable tasks. There is no guarantee oom victims will * ever reach a point they go away we have to wait with a timeout. */ if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs))) error = -EBUSY; if (error) thaw_processes(); return error; } /** * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. * * On success, returns 0. On failure, -errno and only the kernel threads are * thawed, so as to give a chance to the caller to do additional cleanups * (if any) before thawing the userspace tasks. So, it is the responsibility * of the caller to thaw the userspace tasks, when the time is right. */ int freeze_kernel_threads(void) { int error; pm_nosig_freezing = true; error = try_to_freeze_tasks(false); BUG_ON(in_atomic()); if (error) thaw_kernel_threads(); return error; } void thaw_processes(void) { struct task_struct *g, *p; struct task_struct *curr = current; trace_suspend_resume(TPS("thaw_processes"), 0, true); if (pm_freezing) static_branch_dec(&freezer_active); pm_freezing = false; pm_nosig_freezing = false; oom_killer_enable(); pr_info("Restarting tasks: Starting\n"); __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); __thaw_task(p); } read_unlock(&tasklist_lock); WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); curr->flags &= ~PF_SUSPEND_TASK; usermodehelper_enable(); schedule(); pr_info("Restarting tasks: Done\n"); trace_suspend_resume(TPS("thaw_processes"), 0, false); } void thaw_kernel_threads(void) { struct task_struct *g, *p; pm_nosig_freezing = false; pr_info("Restarting kernel threads ...\n"); thaw_workqueues(); read_lock(&tasklist_lock); for_each_process_thread(g, p) { if (p->flags & PF_KTHREAD) __thaw_task(p); } read_unlock(&tasklist_lock); schedule(); pr_info("Done restarting kernel threads.\n"); } |
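/*
 * Illustrative sketch, not part of process.c: the calling convention the
 * comments above describe. freeze_processes() thaws everything itself on
 * failure, while a failing freeze_kernel_threads() only thaws the kernel
 * threads, so the caller still has to thaw user space. The wrapper function
 * name is hypothetical.
 */
static int example_freeze_everything(void)
{
	int error;

	error = freeze_processes();		/* user space first */
	if (error)
		return error;			/* system already fully thawed */

	error = freeze_kernel_threads();	/* then freezable kthreads */
	if (error)
		thaw_processes();		/* undo the user space freeze */

	return error;
}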
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 * Authors:	Lotsa people, from code originally in tcp
 */

#ifndef _INET_HASHTABLES_H
#define _INET_HASHTABLES_H

#include <linux/interrupt.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/wait.h>

#include <net/inet_connection_sock.h>
#include <net/inet_sock.h>
#include <net/ip.h>
#include <net/sock.h>
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/netns/hash.h>

#include <linux/refcount.h>
#include <asm/byteorder.h>

/* This is for all connections with a full identity, no wildcards.
 * The 'e' prefix stands for Establish, but we really put all sockets
 * but LISTEN ones.
 */
struct inet_ehash_bucket {
	struct hlist_nulls_head chain;
};

/* There are a few simple rules, which allow for local port reuse by
 * an application.  In essence:
 *
 *	1) Sockets bound to different interfaces may share a local port.
 *	   Failing that, goto test 2.
 *	2) If all sockets have sk->sk_reuse set, and none of them are in
 *	   TCP_LISTEN state, the port may be shared.
 *	   Failing that, goto test 3.
* 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local * address, and none of them are the same, the port may be * shared. * Failing this, the port cannot be shared. * * The interesting point, is test #2. This is what an FTP server does * all day. To optimize this case we use a specific flag bit defined * below. As we add sockets to a bind bucket list, we perform a * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN)) * As long as all sockets added to a bind bucket pass this test, * the flag bit will be set. * The resulting situation is that tcp_v[46]_verify_bind() can just check * for this flag bit, if it is set and the socket trying to bind has * sk->sk_reuse set, we don't even have to walk the owners list at all, * we return that it is ok to bind this socket to the requested local port. * * Sounds like a lot of work, but it is worth it. In a more naive * implementation (ie. current FreeBSD etc.) the entire list of ports * must be walked for each data port opened by an ftp server. Needless * to say, this does not scale at all. With a couple thousand FTP * users logged onto your box, isn't it nice to know that new data * ports are created in O(1) time? I thought so. ;-) -DaveM */ #define FASTREUSEPORT_ANY 1 #define FASTREUSEPORT_STRICT 2 struct inet_bind_bucket { possible_net_t ib_net; int l3mdev; unsigned short port; signed char fastreuse; signed char fastreuseport; kuid_t fastuid; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr fast_v6_rcv_saddr; #endif __be32 fast_rcv_saddr; unsigned short fast_sk_family; bool fast_ipv6_only; struct hlist_node node; struct hlist_head bhash2; struct rcu_head rcu; }; struct inet_bind2_bucket { possible_net_t ib_net; int l3mdev; unsigned short port; #if IS_ENABLED(CONFIG_IPV6) unsigned short addr_type; struct in6_addr v6_rcv_saddr; #define rcv_saddr v6_rcv_saddr.s6_addr32[3] #else __be32 rcv_saddr; #endif /* Node in the bhash2 inet_bind_hashbucket chain */ struct hlist_node node; struct hlist_node bhash_node; /* List of sockets hashed to this bucket */ struct hlist_head owners; signed char fastreuse; signed char fastreuseport; }; static inline struct net *ib_net(const struct inet_bind_bucket *ib) { return read_pnet(&ib->ib_net); } static inline struct net *ib2_net(const struct inet_bind2_bucket *ib) { return read_pnet(&ib->ib_net); } #define inet_bind_bucket_for_each(tb, head) \ hlist_for_each_entry(tb, head, node) struct inet_bind_hashbucket { spinlock_t lock; struct hlist_head chain; }; /* Sockets can be hashed in established or listening table. * We must use different 'nulls' end-of-chain value for all hash buckets : * A socket might transition from ESTABLISH to LISTEN state without * RCU grace period. A lookup in ehash table needs to handle this case. */ #define LISTENING_NULLS_BASE (1U << 29) struct inet_listen_hashbucket { spinlock_t lock; struct hlist_nulls_head nulls_head; }; /* This is for listening sockets, thus all sockets which possess wildcards. */ #define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ struct inet_hashinfo { /* This is for sockets with full identity only. Sockets here will * always be without wildcards and will have the following invariant: * * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE * */ struct inet_ehash_bucket *ehash; spinlock_t *ehash_locks; unsigned int ehash_mask; unsigned int ehash_locks_mask; /* Ok, let's try this, I give up, we do need a local binding * TCP hash as well as the others for fast bind/connect. 
*/ struct kmem_cache *bind_bucket_cachep; /* This bind table is hashed by local port */ struct inet_bind_hashbucket *bhash; struct kmem_cache *bind2_bucket_cachep; /* This bind table is hashed by local port and sk->sk_rcv_saddr (ipv4) * or sk->sk_v6_rcv_saddr (ipv6). This 2nd bind table is used * primarily for expediting bind conflict resolution. */ struct inet_bind_hashbucket *bhash2; unsigned int bhash_size; /* The 2nd listener table hashed by local port and address */ unsigned int lhash2_mask; struct inet_listen_hashbucket *lhash2; bool pernet; } ____cacheline_aligned_in_smp; static inline struct inet_hashinfo *tcp_get_hashinfo(const struct sock *sk) { return sock_net(sk)->ipv4.tcp_death_row.hashinfo; } static inline struct inet_listen_hashbucket * inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash) { return &h->lhash2[hash & h->lhash2_mask]; } static inline struct inet_ehash_bucket *inet_ehash_bucket( struct inet_hashinfo *hashinfo, unsigned int hash) { return &hashinfo->ehash[hash & hashinfo->ehash_mask]; } static inline spinlock_t *inet_ehash_lockp( struct inet_hashinfo *hashinfo, unsigned int hash) { return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask]; } int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo); static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) { kvfree(hashinfo->ehash_locks); hashinfo->ehash_locks = NULL; } struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries); void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo); struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev); void inet_bind_bucket_destroy(struct inet_bind_bucket *tb); bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, unsigned short port, int l3mdev); struct inet_bind2_bucket * inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, struct inet_bind_bucket *tb, const struct sock *sk); void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb); struct inet_bind2_bucket * inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk); bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk); static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, const u32 bhash_size) { return (lport + net_hash_mix(net)) & (bhash_size - 1); } static inline struct inet_bind_hashbucket * inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk, const struct net *net, unsigned short port) { u32 hash; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port); else #endif hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port); return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port); /* This should be called whenever a socket's sk_rcv_saddr (ipv4) or * sk_v6_rcv_saddr (ipv6) changes after it has been binded. The socket's * rcv_saddr field should already have been updated when this is called. 
*/ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family); void inet_bhash2_reset_saddr(struct sock *sk); void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port); /* Caller must disable local BH processing. */ int __inet_inherit_port(const struct sock *sk, struct sock *child); void inet_put_port(struct sock *sk); void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit); int inet_hashinfo2_init_mod(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); struct sock *__inet_lookup_listener(const struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif); static inline struct sock *inet_lookup_listener(struct net *net, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif) { return __inet_lookup_listener(net, skb, doff, saddr, sport, daddr, ntohs(dport), dif, sdif); } /* Socket demux engine toys. */ /* What happens here is ugly; there's a pair of adjacent fields in struct inet_sock; __be16 dport followed by __u16 num. We want to search by pair, so we combine the keys into a single 32bit value and compare with 32bit value read from &...->dport. Let's at least make sure that it's not mixed with anything else... On 64bit targets we combine comparisons with pair of adjacent __be32 fields in the same way. */ #ifdef __BIG_ENDIAN #define INET_COMBINED_PORTS(__sport, __dport) \ ((__force __portpair)(((__force __u32)(__be16)(__sport) << 16) | (__u32)(__dport))) #else /* __LITTLE_ENDIAN */ #define INET_COMBINED_PORTS(__sport, __dport) \ ((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport))) #endif #ifdef __BIG_ENDIAN #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ (((__force __u64)(__be32)(__saddr)) << 32) | \ ((__force __u64)(__be32)(__daddr))) #else /* __LITTLE_ENDIAN */ #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ (((__force __u64)(__be32)(__daddr)) << 32) | \ ((__force __u64)(__be32)(__saddr))) #endif /* __BIG_ENDIAN */ static inline bool inet_match(const struct net *net, const struct sock *sk, const __addrpair cookie, const __portpair ports, int dif, int sdif) { if (!net_eq(sock_net(sk), net) || sk->sk_portpair != ports || sk->sk_addrpair != cookie) return false; /* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */ return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, sdif); } /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need * not check it for lookups anymore, thanks Alexey. 
-DaveM */ struct sock *__inet_lookup_established(const struct net *net, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif); typedef u32 (inet_ehashfn_t)(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport); inet_ehashfn_t inet_ehashfn; INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn); struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, inet_ehashfn_t *ehashfn); struct sock *inet_lookup_run_sk_lookup(const struct net *net, int protocol, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, u16 hnum, const int dif, inet_ehashfn_t *ehashfn); static inline struct sock *inet_lookup_established(struct net *net, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) { return __inet_lookup_established(net, saddr, sport, daddr, ntohs(dport), dif, 0); } static inline struct sock *__inet_lookup(struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif, const int sdif, bool *refcounted) { u16 hnum = ntohs(dport); struct sock *sk; sk = __inet_lookup_established(net, saddr, sport, daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; return __inet_lookup_listener(net, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } static inline struct sock *inet_lookup(struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) { struct sock *sk; bool refcounted; sk = __inet_lookup(net, skb, doff, saddr, sport, daddr, dport, dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } static inline struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, bool *refcounted, inet_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool prefetched; sk = skb_steal_sock(skb, refcounted, &prefetched); if (!sk) return NULL; if (!prefetched || !sk_fullsock(sk)) return sk; if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_state != TCP_LISTEN) return sk; } else if (sk->sk_protocol == IPPROTO_UDP) { if (sk->sk_state != TCP_CLOSE) return sk; } else { return sk; } reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, ntohs(dport), ehashfn); if (!reuse_sk) return sk; /* We've chosen a new reuseport sock which is never refcounted. This * implies that sk also isn't refcounted. 
*/ WARN_ON_ONCE(*refcounted); return reuse_sk; } static inline struct sock *__inet_lookup_skb(struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, const int sdif, bool *refcounted) { struct net *net = skb_dst_dev_net_rcu(skb); const struct iphdr *iph = ip_hdr(skb); struct sock *sk; sk = inet_steal_sock(net, skb, doff, iph->saddr, sport, iph->daddr, dport, refcounted, inet_ehashfn); if (IS_ERR(sk)) return NULL; if (sk) return sk; return __inet_lookup(net, skb, doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb), sdif, refcounted); } static inline void sk_daddr_set(struct sock *sk, __be32 addr) { sk->sk_daddr = addr; /* alias of inet_daddr */ #if IS_ENABLED(CONFIG_IPV6) ipv6_addr_set_v4mapped(addr, &sk->sk_v6_daddr); #endif } static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr) { sk->sk_rcv_saddr = addr; /* alias of inet_rcv_saddr */ #if IS_ENABLED(CONFIG_IPV6) ipv6_addr_set_v4mapped(addr, &sk->sk_v6_rcv_saddr); #endif } int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, u32 hash_port0, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **, bool rcu_lookup, u32 hash)); int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); #endif /* _INET_HASHTABLES_H */ |
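/*
 * Illustrative sketch, not part of inet_hashtables.h: a full 4-tuple lookup
 * with inet_lookup() as declared above. The returned socket is
 * reference-counted, so the caller must drop it with sock_put(). The function
 * name and its "is this connection known" purpose are made up for
 * illustration.
 */
static inline bool example_tuple_is_known(struct net *net, struct sk_buff *skb,
					  int doff, __be32 saddr, __be16 sport,
					  __be32 daddr, __be16 dport, int dif)
{
	struct sock *sk;

	sk = inet_lookup(net, skb, doff, saddr, sport, daddr, dport, dif);
	if (!sk)
		return false;

	sock_put(sk);	/* inet_lookup() handed us a held reference */
	return true;
}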
| 3 425 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 | // SPDX-License-Identifier: GPL-2.0-only /* * xfrm_nat_keepalive.c * * (c) 2024 Eyal Birger <eyal.birger@gmail.com> */ #include <net/inet_common.h> #include <net/ip6_checksum.h> #include <net/xfrm.h> static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv4) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; #if IS_ENABLED(CONFIG_IPV6) static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv6) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; #endif struct nat_keepalive { struct net *net; u16 family; xfrm_address_t saddr; xfrm_address_t daddr; __be16 encap_sport; __be16 encap_dport; __u32 smark; }; static void nat_keepalive_init(struct nat_keepalive *ka, struct xfrm_state *x) { ka->net = xs_net(x); ka->family = x->props.family; ka->saddr = x->props.saddr; ka->daddr = x->id.daddr; ka->encap_sport = x->encap->encap_sport; ka->encap_dport = x->encap->encap_dport; ka->smark = xfrm_smark_get(0, x); } static int nat_keepalive_send_ipv4(struct sk_buff *skb, struct nat_keepalive *ka) { struct net *net = ka->net; struct flowi4 fl4; struct rtable *rt; struct sock *sk; __u8 tos = 0; int err; flowi4_init_output(&fl4, 0 /* oif */, skb->mark, tos, RT_SCOPE_UNIVERSE, IPPROTO_UDP, 0, ka->daddr.a4, ka->saddr.a4, ka->encap_dport, ka->encap_sport, sock_net_uid(net, NULL)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) return PTR_ERR(rt); skb_dst_set(skb, &rt->dst); local_lock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock); sk = this_cpu_read(nat_keepalive_sk_ipv4.sock); sock_net_set(sk, net); err = ip_build_and_send_pkt(skb, sk, fl4.saddr, fl4.daddr, NULL, tos); sock_net_set(sk, &init_net); local_unlock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock); return err; } #if IS_ENABLED(CONFIG_IPV6) static int nat_keepalive_send_ipv6(struct sk_buff *skb, struct nat_keepalive *ka, struct udphdr *uh) { struct net *net = ka->net; struct dst_entry *dst; struct flowi6 fl6; struct sock *sk; __wsum csum; int err; csum = skb_checksum(skb, 0, skb->len, 0); uh->check = csum_ipv6_magic(&ka->saddr.in6, &ka->daddr.in6, skb->len, IPPROTO_UDP, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = skb->mark; fl6.saddr = ka->saddr.in6; fl6.daddr = ka->daddr.in6; fl6.flowi6_proto = IPPROTO_UDP; fl6.fl6_sport = ka->encap_sport; fl6.fl6_dport = ka->encap_dport; local_lock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock); sk = this_cpu_read(nat_keepalive_sk_ipv6.sock); sock_net_set(sk, net); dst = 
ipv6_stub->ipv6_dst_lookup_flow(net, sk, &fl6, NULL); if (IS_ERR(dst)) { local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock); return PTR_ERR(dst); } skb_dst_set(skb, dst); err = ipv6_stub->ip6_xmit(sk, skb, &fl6, skb->mark, NULL, 0, 0); sock_net_set(sk, &init_net); local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock); return err; } #endif static void nat_keepalive_send(struct nat_keepalive *ka) { const int nat_ka_hdrs_len = max(sizeof(struct iphdr), sizeof(struct ipv6hdr)) + sizeof(struct udphdr); const u8 nat_ka_payload = 0xFF; int err = -EAFNOSUPPORT; struct sk_buff *skb; struct udphdr *uh; skb = alloc_skb(nat_ka_hdrs_len + sizeof(nat_ka_payload), GFP_ATOMIC); if (unlikely(!skb)) return; skb_reserve(skb, nat_ka_hdrs_len); skb_put_u8(skb, nat_ka_payload); uh = skb_push(skb, sizeof(*uh)); uh->source = ka->encap_sport; uh->dest = ka->encap_dport; uh->len = htons(skb->len); uh->check = 0; skb->mark = ka->smark; switch (ka->family) { case AF_INET: err = nat_keepalive_send_ipv4(skb, ka); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: err = nat_keepalive_send_ipv6(skb, ka, uh); break; #endif } if (err) kfree_skb(skb); } struct nat_keepalive_work_ctx { time64_t next_run; time64_t now; }; static int nat_keepalive_work_single(struct xfrm_state *x, int count, void *ptr) { struct nat_keepalive_work_ctx *ctx = ptr; bool send_keepalive = false; struct nat_keepalive ka; time64_t next_run; u32 interval; int delta; interval = x->nat_keepalive_interval; if (!interval) return 0; spin_lock(&x->lock); delta = (int)(ctx->now - x->lastused); if (delta < interval) { x->nat_keepalive_expiration = ctx->now + interval - delta; next_run = x->nat_keepalive_expiration; } else if (x->nat_keepalive_expiration > ctx->now) { next_run = x->nat_keepalive_expiration; } else { next_run = ctx->now + interval; nat_keepalive_init(&ka, x); send_keepalive = true; } spin_unlock(&x->lock); if (send_keepalive) nat_keepalive_send(&ka); if (!ctx->next_run || next_run < ctx->next_run) ctx->next_run = next_run; return 0; } static void nat_keepalive_work(struct work_struct *work) { struct nat_keepalive_work_ctx ctx; struct xfrm_state_walk walk; struct net *net; ctx.next_run = 0; ctx.now = ktime_get_real_seconds(); net = container_of(work, struct net, xfrm.nat_keepalive_work.work); xfrm_state_walk_init(&walk, IPPROTO_ESP, NULL); xfrm_state_walk(net, &walk, nat_keepalive_work_single, &ctx); xfrm_state_walk_done(&walk, net); if (ctx.next_run) schedule_delayed_work(&net->xfrm.nat_keepalive_work, (ctx.next_run - ctx.now) * HZ); } static int nat_keepalive_sk_init(struct sock_bh_locked __percpu *socks, unsigned short family) { struct sock *sk; int err, i; for_each_possible_cpu(i) { err = inet_ctl_sock_create(&sk, family, SOCK_RAW, IPPROTO_UDP, &init_net); if (err < 0) goto err; per_cpu_ptr(socks, i)->sock = sk; } return 0; err: for_each_possible_cpu(i) inet_ctl_sock_destroy(per_cpu_ptr(socks, i)->sock); return err; } static void nat_keepalive_sk_fini(struct sock_bh_locked __percpu *socks) { int i; for_each_possible_cpu(i) inet_ctl_sock_destroy(per_cpu_ptr(socks, i)->sock); } void xfrm_nat_keepalive_state_updated(struct xfrm_state *x) { struct net *net; if (!x->nat_keepalive_interval) return; net = xs_net(x); schedule_delayed_work(&net->xfrm.nat_keepalive_work, 0); } int __net_init xfrm_nat_keepalive_net_init(struct net *net) { INIT_DELAYED_WORK(&net->xfrm.nat_keepalive_work, nat_keepalive_work); return 0; } int xfrm_nat_keepalive_net_fini(struct net *net) { cancel_delayed_work_sync(&net->xfrm.nat_keepalive_work); return 0; } int 
xfrm_nat_keepalive_init(unsigned short family) { int err = -EAFNOSUPPORT; switch (family) { case AF_INET: err = nat_keepalive_sk_init(&nat_keepalive_sk_ipv4, PF_INET); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: err = nat_keepalive_sk_init(&nat_keepalive_sk_ipv6, PF_INET6); break; #endif } if (err) pr_err("xfrm nat keepalive init: failed to init err:%d\n", err); return err; } EXPORT_SYMBOL_GPL(xfrm_nat_keepalive_init); void xfrm_nat_keepalive_fini(unsigned short family) { switch (family) { case AF_INET: nat_keepalive_sk_fini(&nat_keepalive_sk_ipv4); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: nat_keepalive_sk_fini(&nat_keepalive_sk_ipv6); break; #endif } } EXPORT_SYMBOL_GPL(xfrm_nat_keepalive_fini); |
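/*
 * Illustrative sketch, not part of xfrm_nat_keepalive.c: how an init path
 * could bring up the per-family keepalive sockets exported above and unwind
 * the IPv4 side again if the IPv6 side fails. The function name is
 * hypothetical.
 */
static int __init example_nat_keepalive_bringup(void)
{
	int err;

	err = xfrm_nat_keepalive_init(AF_INET);
	if (err)
		return err;

#if IS_ENABLED(CONFIG_IPV6)
	err = xfrm_nat_keepalive_init(AF_INET6);
	if (err) {
		xfrm_nat_keepalive_fini(AF_INET);
		return err;
	}
#endif

	return 0;
}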
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2010-2011 EIA Electronics,
//                         Kurt Van Dijck <kurt.van.dijck@eia.be>
// Copyright (c) 2010-2011 EIA Electronics,
//                         Pieter Beyens <pieter.beyens@eia.be>
// Copyright (c) 2017-2019 Pengutronix,
//                         Marc Kleine-Budde <kernel@pengutronix.de>
// Copyright (c) 2017-2019 Pengutronix,
//                         Oleksij Rempel <kernel@pengutronix.de>

/* J1939 Address Claiming.
 * Address Claiming in the kernel
 * - keeps track of the AC states of ECU's,
 * - resolves NAME<=>SA taking into account the AC states of ECU's.
 *
 * All Address Claim msgs (including host-originated msg) are processed
 * at the receive path (a sent msg is always received again via CAN echo).
 * As such, the processing of AC msgs is done in the order on which msgs
 * are sent on the bus.
 *
 * This module doesn't send msgs itself (e.g. replies on Address Claims),
 * this is the responsibility of a user space application or daemon.
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/netdevice.h> #include <linux/skbuff.h> #include "j1939-priv.h" static inline name_t j1939_skb_to_name(const struct sk_buff *skb) { return le64_to_cpup((__le64 *)skb->data); } static inline bool j1939_ac_msg_is_request(struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); int req_pgn; if (skb->len < 3 || skcb->addr.pgn != J1939_PGN_REQUEST) return false; req_pgn = skb->data[0] | (skb->data[1] << 8) | (skb->data[2] << 16); return req_pgn == J1939_PGN_ADDRESS_CLAIMED; } static int j1939_ac_verify_outgoing(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); if (skb->len != 8) { netdev_notice(priv->ndev, "tx address claim with dlc %i\n", skb->len); return -EPROTO; } if (skcb->addr.src_name != j1939_skb_to_name(skb)) { netdev_notice(priv->ndev, "tx address claim with different name\n"); return -EPROTO; } if (skcb->addr.sa == J1939_NO_ADDR) { netdev_notice(priv->ndev, "tx address claim with broadcast sa\n"); return -EPROTO; } /* ac must always be a broadcast */ if (skcb->addr.dst_name || skcb->addr.da != J1939_NO_ADDR) { netdev_notice(priv->ndev, "tx address claim with dest, not broadcast\n"); return -EPROTO; } return 0; } int j1939_ac_fixup(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); int ret; u8 addr; /* network mgmt: address claiming msgs */ if (skcb->addr.pgn == J1939_PGN_ADDRESS_CLAIMED) { struct j1939_ecu *ecu; ret = j1939_ac_verify_outgoing(priv, skb); /* return both when failure & when successful */ if (ret < 0) return ret; ecu = j1939_ecu_get_by_name(priv, skcb->addr.src_name); if (!ecu) return -ENODEV; if (ecu->addr != skcb->addr.sa) /* hold further traffic for ecu, remove from parent */ j1939_ecu_unmap(ecu); j1939_ecu_put(ecu); } else if (skcb->addr.src_name) { /* assign source address */ addr = j1939_name_to_addr(priv, skcb->addr.src_name); if (!j1939_address_is_unicast(addr) && !j1939_ac_msg_is_request(skb)) { netdev_notice(priv->ndev, "tx drop: invalid sa for name 0x%016llx\n", skcb->addr.src_name); return -EADDRNOTAVAIL; } skcb->addr.sa = addr; } /* assign destination address */ if (skcb->addr.dst_name) { addr = j1939_name_to_addr(priv, skcb->addr.dst_name); if (!j1939_address_is_unicast(addr)) { netdev_notice(priv->ndev, "tx drop: invalid da for name 0x%016llx\n", skcb->addr.dst_name); return -EADDRNOTAVAIL; } skcb->addr.da = addr; } return 0; } static void j1939_ac_process(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); struct j1939_ecu *ecu, *prev; name_t name; if (skb->len != 8) { netdev_notice(priv->ndev, "rx address claim with wrong dlc %i\n", skb->len); return; } name = j1939_skb_to_name(skb); skcb->addr.src_name = name; if (!name) { netdev_notice(priv->ndev, "rx address claim without name\n"); return; } if (!j1939_address_is_valid(skcb->addr.sa)) { netdev_notice(priv->ndev, "rx address claim with broadcast sa\n"); return; } write_lock_bh(&priv->lock); /* Few words on the ECU ref counting: * * First we get an ECU handle, either with * j1939_ecu_get_by_name_locked() (increments the ref counter) * or j1939_ecu_create_locked() (initializes an ECU object * with a ref counter of 1). * * j1939_ecu_unmap_locked() will decrement the ref counter, * but only if the ECU was mapped before. So "ecu" still * belongs to us. 
* * j1939_ecu_timer_start() will increment the ref counter * before it starts the timer, so we can put the ecu when * leaving this function. */ ecu = j1939_ecu_get_by_name_locked(priv, name); if (ecu && ecu->addr == skcb->addr.sa) { /* The ISO 11783-5 standard, in "4.5.2 - Address claim * requirements", states: * d) No CF shall begin, or resume, transmission on the * network until 250 ms after it has successfully claimed * an address except when responding to a request for * address-claimed. * * But "Figure 6" and "Figure 7" in "4.5.4.2 - Address-claim * prioritization" show that the CF begins the transmission * after 250 ms from the first AC (address-claimed) message * even if it sends another AC message during that time window * to resolve the address contention with another CF. * * As stated in "4.4.2.3 - Address-claimed message": * In order to successfully claim an address, the CF sending * an address claimed message shall not receive a contending * claim from another CF for at least 250 ms. * * As stated in "4.4.3.2 - NAME management (NM) message": * 1) A commanding CF can * d) request that a CF with a specified NAME transmit * the address-claimed message with its current NAME. * 2) A target CF shall * d) send an address-claimed message in response to a * request for a matching NAME * * Taking the above arguments into account, the 250 ms wait is * requested only during network initialization. * * Do not restart the timer on AC message if both the NAME and * the address match and so if the address has already been * claimed (timer has expired) or the AC message has been sent * to resolve the contention with another CF (timer is still * running). */ goto out_ecu_put; } if (!ecu && j1939_address_is_unicast(skcb->addr.sa)) ecu = j1939_ecu_create_locked(priv, name); if (IS_ERR_OR_NULL(ecu)) goto out_unlock_bh; /* cancel pending (previous) address claim */ j1939_ecu_timer_cancel(ecu); if (j1939_address_is_idle(skcb->addr.sa)) { j1939_ecu_unmap_locked(ecu); goto out_ecu_put; } /* save new addr */ if (ecu->addr != skcb->addr.sa) j1939_ecu_unmap_locked(ecu); ecu->addr = skcb->addr.sa; prev = j1939_ecu_get_by_addr_locked(priv, skcb->addr.sa); if (prev) { if (ecu->name > prev->name) { j1939_ecu_unmap_locked(ecu); j1939_ecu_put(prev); goto out_ecu_put; } else { /* kick prev if less or equal */ j1939_ecu_unmap_locked(prev); j1939_ecu_put(prev); } } j1939_ecu_timer_start(ecu); out_ecu_put: j1939_ecu_put(ecu); out_unlock_bh: write_unlock_bh(&priv->lock); } void j1939_ac_recv(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); struct j1939_ecu *ecu; /* network mgmt */ if (skcb->addr.pgn == J1939_PGN_ADDRESS_CLAIMED) { j1939_ac_process(priv, skb); } else if (j1939_address_is_unicast(skcb->addr.sa)) { /* assign source name */ ecu = j1939_ecu_get_by_addr(priv, skcb->addr.sa); if (ecu) { skcb->addr.src_name = ecu->name; j1939_ecu_put(ecu); } } /* assign destination name */ ecu = j1939_ecu_get_by_addr(priv, skcb->addr.da); if (ecu) { skcb->addr.dst_name = ecu->name; j1939_ecu_put(ecu); } } |
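/*
 * Illustrative sketch, not part of address_claim.c: the transmit side is
 * expected to run j1939_ac_fixup() (defined above) so that source and
 * destination NAMEs are translated to claimed addresses before a frame is
 * queued. The surrounding function and its error handling are made up for
 * illustration.
 */
static int example_j1939_xmit_fixup(struct j1939_priv *priv, struct sk_buff *skb)
{
	int ret;

	/* Fills skcb->addr.sa/da from the NAMEs, or rejects the frame. */
	ret = j1939_ac_fixup(priv, skb);
	if (ret < 0) {
		kfree_skb(skb);
		return ret;
	}

	/* ... hand the skb on to the CAN core from here ... */
	return 0;
}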
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Linux NET3:	IP/IP protocol decoder.
 *
 *	Authors:
 *		Sam Lantinga (slouken@cs.ucdavis.edu)  02/01/95
 *
 *	Fixes:
 *		Alan Cox	:	Merged and made usable non modular (its so tiny its silly as
 *					a module taking up 2 pages).
 *		Alan Cox	: 	Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
 *					to keep ip_forward happy.
 *		Alan Cox	:	More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
 *		Kai Schulte	:	Fixed #defines for IP_FIREWALL->FIREWALL
 *              David Woodhouse :       Perform some basic ICMP handling.
 *                                      IPIP Routing without decapsulation.
 *              Carlos Picoto   :       GRE over IP support
 *		Alexey Kuznetsov:	Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
 *					I do not want to merge them together.
*/ /* tunnel.c: an IP tunnel driver The purpose of this driver is to provide an IP tunnel through which you can tunnel network traffic transparently across subnets. This was written by looking at Nick Holloway's dummy driver Thanks for the great code! -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 Minor tweaks: Cleaned up the code a little and added some pre-1.3.0 tweaks. dev->hard_header/hard_header_len changed to use no headers. Comments/bracketing tweaked. Made the tunnels use dev->name not tunnel: when error reporting. Added tx_dropped stat -Alan Cox (alan@lxorguk.ukuu.org.uk) 21 March 95 Reworked: Changed to tunnel to destination gateway in addition to the tunnel's pointopoint address Almost completely rewritten Note: There is currently no firewall or ICMP handling done. -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96 */ /* Things I wish I had known when writing the tunnel driver: When the tunnel_xmit() function is called, the skb contains the packet to be sent (plus a great deal of extra info), and dev contains the tunnel device that _we_ are. When we are passed a packet, we are expected to fill in the source address with our source IP address. What is the proper way to allocate, copy and free a buffer? After you allocate it, it is a "0 length" chunk of memory starting at zero. If you want to add headers to the buffer later, you'll have to call "skb_reserve(skb, amount)" with the amount of memory you want reserved. Then, you call "skb_put(skb, amount)" with the amount of space you want in the buffer. skb_put() returns a pointer to the top (#0) of that buffer. skb->len is set to the amount of space you have "allocated" with skb_put(). You can then write up to skb->len bytes to that buffer. If you need more, you can call skb_put() again with the additional amount of space you need. You can find out how much more space you can allocate by calling "skb_tailroom(skb)". Now, to add header space, call "skb_push(skb, header_len)". This creates space at the beginning of the buffer and returns a pointer to this new space. If later you need to strip a header from a buffer, call "skb_pull(skb, header_len)". skb_headroom() will return how much space is left at the top of the buffer (before the main data). Remember, this headroom space must be reserved before the skb_put() function is called. */ /* This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c For comments look at net/ipv4/ip_gre.c --ANK */ #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/dst_metadata.h> static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static unsigned int ipip_net_id __read_mostly; static int ipip_tunnel_init(struct net_device *dev); static struct rtnl_link_ops ipip_link_ops __read_mostly; static int ipip_err(struct sk_buff *skb, u32 info) { /* All the routers (except for Linux) return only * 8 bytes of packet payload. 
It means, that precise relaying of * ICMP in the real Internet is absolutely infeasible. */ struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); const struct iphdr *iph = (const struct iphdr *)skb->data; IP_TUNNEL_DECLARE_FLAGS(flags) = { }; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; int err = 0; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); t = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr, iph->saddr, 0); if (!t) { err = -ENOENT; goto out; } switch (type) { case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: /* Impossible event. */ goto out; default: /* All others are translated to HOST_UNREACH. * rfc2003 contains "deep thoughts" about NET_UNREACH, * I believe they are just ether pollution. --ANK */ break; } break; case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) goto out; break; case ICMP_REDIRECT: break; default: goto out; } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol); goto out; } if (type == ICMP_REDIRECT) { ipv4_redirect(skb, net, t->parms.link, iph->protocol); goto out; } if (t->parms.iph.daddr == 0) { err = -ENOENT; goto out; } if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; out: return err; } static const struct tnl_ptk_info ipip_tpi = { /* no tunnel info required for ipip. */ .proto = htons(ETH_P_IP), }; #if IS_ENABLED(CONFIG_MPLS) static const struct tnl_ptk_info mplsip_tpi = { /* no tunnel info required for mplsip. */ .proto = htons(ETH_P_MPLS_UC), }; #endif static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *tun_dst = NULL; struct ip_tunnel *tunnel; const struct iphdr *iph; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, 0); if (tunnel) { const struct tnl_ptk_info *tpi; if (tunnel->parms.iph.protocol != ipproto && tunnel->parms.iph.protocol != 0) goto drop; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; #if IS_ENABLED(CONFIG_MPLS) if (ipproto == IPPROTO_MPLS) tpi = &mplsip_tpi; else #endif tpi = &ipip_tpi; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (tunnel->collect_md) { ip_tunnel_flags_zero(flags); tun_dst = ip_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) return 0; ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info); } skb_reset_mac_header(skb); return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); } return -1; drop: kfree_skb(skb); return 0; } static int ipip_rcv(struct sk_buff *skb) { return ipip_tunnel_rcv(skb, IPPROTO_IPIP); } #if IS_ENABLED(CONFIG_MPLS) static int mplsip_rcv(struct sk_buff *skb) { return ipip_tunnel_rcv(skb, IPPROTO_MPLS); } #endif /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. 
*/ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; u8 ipproto; if (!pskb_inet_may_pull(skb)) goto tx_error; switch (skb->protocol) { case htons(ETH_P_IP): ipproto = IPPROTO_IPIP; break; #if IS_ENABLED(CONFIG_MPLS) case htons(ETH_P_MPLS_UC): ipproto = IPPROTO_MPLS; break; #endif default: goto tx_error; } if (tiph->protocol != ipproto && tiph->protocol != 0) goto tx_error; if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) goto tx_error; skb_set_inner_ipproto(skb, ipproto); if (tunnel->collect_md) ip_md_tunnel_xmit(skb, dev, ipproto, 0); else ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; tx_error: kfree_skb(skb); DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto) { switch (ipproto) { case 0: case IPPROTO_IPIP: #if IS_ENABLED(CONFIG_MPLS) case IPPROTO_MPLS: #endif return true; } return false; } static int ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || !ipip_tunnel_ioctl_verify_protocol(p->iph.protocol) || p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF))) return -EINVAL; } p->i_key = p->o_key = 0; ip_tunnel_flags_zero(p->i_flags); ip_tunnel_flags_zero(p->o_flags); return ip_tunnel_ctl(dev, p, cmd); } static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = ipip_tunnel_xmit, .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip_tunnel_ctl, }; #define IPIP_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static void ipip_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_TUNNEL; dev->flags = IFF_NOARP; dev->addr_len = 4; dev->lltx = true; netif_keep_dst(dev); dev->features |= IPIP_FEATURES; dev->hw_features |= IPIP_FEATURES; ip_tunnel_setup(dev, ipip_net_id); } static int ipip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); __dev_addr_set(dev, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); tunnel->tun_hlen = 0; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; return ip_tunnel_init(dev); } static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (proto != IPPROTO_IPIP && proto != IPPROTO_MPLS && proto != 0) return -EINVAL; return 0; } static void ipip_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm_kern *parms, bool *collect_md, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); parms->iph.version = 4; parms->iph.protocol = IPPROTO_IPIP; parms->iph.ihl = 5; *collect_md = false; if (!data) return; ip_tunnel_netlink_parms(data, parms); if (data[IFLA_IPTUN_COLLECT_METADATA]) *collect_md = true; if (data[IFLA_IPTUN_FWMARK]) *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } static int ipip_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; 
struct ip_tunnel_encap ipencap; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } ipip_netlink_parms(data, &p, &t->collect_md, &fwmark); return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p, fwmark); } static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_encap ipencap; struct ip_tunnel_parm_kern p; bool collect_md; __u32 fwmark = t->fwmark; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } ipip_netlink_parms(data, &p, &collect_md, &fwmark); if (collect_md) return -EINVAL; if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) return -EINVAL; return ip_tunnel_changelink(dev, tb, &p, fwmark); } static size_t ipip_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(4) + /* IFLA_IPTUN_REMOTE */ nla_total_size(4) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_TOS */ nla_total_size(1) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + /* IFLA_IPTUN_PMTUDISC */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_parm_kern *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF))) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; if (tunnel->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 }, [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ipip_link_ops __read_mostly = { .kind = "ipip", .maxtype = IFLA_IPTUN_MAX, .policy = ipip_policy, .priv_size = sizeof(struct 
ip_tunnel), .setup = ipip_tunnel_setup, .validate = ipip_tunnel_validate, .newlink = ipip_newlink, .changelink = ipip_changelink, .dellink = ip_tunnel_dellink, .get_size = ipip_get_size, .fill_info = ipip_fill_info, .get_link_net = ip_tunnel_get_link_net, }; static struct xfrm_tunnel ipip_handler __read_mostly = { .handler = ipip_rcv, .err_handler = ipip_err, .priority = 1, }; #if IS_ENABLED(CONFIG_MPLS) static struct xfrm_tunnel mplsip_handler __read_mostly = { .handler = mplsip_rcv, .err_handler = ipip_err, .priority = 1, }; #endif static int __net_init ipip_init_net(struct net *net) { return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } static void __net_exit ipip_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { ip_tunnel_delete_net(net, ipip_net_id, &ipip_link_ops, dev_to_kill); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, .exit_rtnl = ipip_exit_rtnl, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; static int __init ipip_init(void) { int err; pr_info("ipip: IPv4 and MPLS over IPv4 tunneling driver\n"); err = register_pernet_device(&ipip_net_ops); if (err < 0) return err; err = xfrm4_tunnel_register(&ipip_handler, AF_INET); if (err < 0) { pr_info("%s: can't register tunnel\n", __func__); goto xfrm_tunnel_ipip_failed; } #if IS_ENABLED(CONFIG_MPLS) err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS); if (err < 0) { pr_info("%s: can't register tunnel\n", __func__); goto xfrm_tunnel_mplsip_failed; } #endif err = rtnl_link_register(&ipip_link_ops); if (err < 0) goto rtnl_link_failed; out: return err; rtnl_link_failed: #if IS_ENABLED(CONFIG_MPLS) xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); xfrm_tunnel_mplsip_failed: #endif xfrm4_tunnel_deregister(&ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: unregister_pernet_device(&ipip_net_ops); goto out; } static void __exit ipip_fini(void) { rtnl_link_unregister(&ipip_link_ops); if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) pr_info("%s: can't deregister tunnel\n", __func__); #if IS_ENABLED(CONFIG_MPLS) if (xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS)) pr_info("%s: can't deregister tunnel\n", __func__); #endif unregister_pernet_device(&ipip_net_ops); } module_init(ipip_init); module_exit(ipip_fini); MODULE_DESCRIPTION("IP/IP protocol decoder library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ipip"); MODULE_ALIAS_NETDEV("tunl0"); |
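The historical tunnel.c comment near the top of this file walks through the sk_buff allocation discipline: reserve headroom first, skb_put() the payload, then skb_push() the headers in front of it. Below is a minimal kernel-style sketch of that sequence; the demo_build_skb() helper and the caller-supplied header/payload buffers are made up for illustration and are not code from ipip.c.

#include <linux/gfp.h>
#include <linux/skbuff.h>
#include <linux/string.h>

static struct sk_buff *demo_build_skb(const void *payload, unsigned int plen,
				      const void *hdr, unsigned int hlen)
{
	struct sk_buff *skb;

	skb = alloc_skb(hlen + plen, GFP_ATOMIC);
	if (!skb)
		return NULL;

	skb_reserve(skb, hlen);				   /* leave room for headers */
	memcpy(skb_put(skb, plen), payload, plen);	   /* append the payload */
	memcpy(skb_push(skb, hlen), hdr, hlen);		   /* prepend the header */

	return skb;	/* caller owns it; kfree_skb() to drop */
}

The point of the ordering is that skb_push() can only grow into headroom that was set aside with skb_reserve() before any data was appended.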
| 16 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 | /* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */ /****************************************************************************** * * Name: aclinuxex.h - Extra OS specific defines, etc. for Linux * * Copyright (C) 2000 - 2025, Intel Corp. * *****************************************************************************/ #ifndef __ACLINUXEX_H__ #define __ACLINUXEX_H__ #ifdef __KERNEL__ #ifndef ACPI_USE_NATIVE_DIVIDE #ifndef ACPI_DIV_64_BY_32 #define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \ do { \ u64 (__n) = ((u64) n_hi) << 32 | (n_lo); \ (r32) = do_div ((__n), (d32)); \ (q32) = (u32) (__n); \ } while (0) #endif #ifndef ACPI_SHIFT_RIGHT_64 #define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \ do { \ (n_lo) >>= 1; \ (n_lo) |= (((n_hi) & 1) << 31); \ (n_hi) >>= 1; \ } while (0) #endif #endif /* * Overrides for in-kernel ACPICA */ acpi_status ACPI_INIT_FUNCTION acpi_os_initialize(void); acpi_status acpi_os_terminate(void); /* * The irqs_disabled() check is for resume from RAM. * Interrupts are off during resume, just like they are for boot. * However, boot has (system_state != SYSTEM_RUNNING) * to quiet __might_sleep() in kmalloc() and resume does not. * * These specialized allocators have to be macros for their allocations to be * accounted separately (to have separate alloc_tag). */ #define acpi_os_allocate(_size) \ kmalloc(_size, irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL) #define acpi_os_allocate_zeroed(_size) \ kzalloc(_size, irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL) #define acpi_os_acquire_object(_cache) \ kmem_cache_zalloc(_cache, irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL) static inline void acpi_os_free(void *memory) { kfree(memory); } static inline acpi_thread_id acpi_os_get_thread_id(void) { return (acpi_thread_id) (unsigned long)current; } /* * When lockdep is enabled, the spin_lock_init() macro stringifies it's * argument and uses that as a name for the lock in debugging. * By executing spin_lock_init() in a macro the key changes from "lock" for * all locks to the name of the argument of acpi_os_create_lock(), which * prevents lockdep from reporting false positives for ACPICA locks. */ #define acpi_os_create_lock(__handle) \ ({ \ spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock)); \ if (lock) { \ *(__handle) = lock; \ spin_lock_init(*(__handle)); \ } \ lock ? AE_OK : AE_NO_MEMORY; \ }) #define acpi_os_create_raw_lock(__handle) \ ({ \ raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock)); \ if (lock) { \ *(__handle) = lock; \ raw_spin_lock_init(*(__handle)); \ } \ lock ? 
AE_OK : AE_NO_MEMORY; \ }) static inline acpi_cpu_flags acpi_os_acquire_raw_lock(acpi_raw_spinlock lockp) { acpi_cpu_flags flags; raw_spin_lock_irqsave(lockp, flags); return flags; } static inline void acpi_os_release_raw_lock(acpi_raw_spinlock lockp, acpi_cpu_flags flags) { raw_spin_unlock_irqrestore(lockp, flags); } static inline void acpi_os_delete_raw_lock(acpi_raw_spinlock handle) { ACPI_FREE(handle); } static inline u8 acpi_os_readable(void *pointer, acpi_size length) { return TRUE; } static inline acpi_status acpi_os_initialize_debugger(void) { return AE_OK; } static inline void acpi_os_terminate_debugger(void) { return; } /* * OSL interfaces added by Linux */ #endif /* __KERNEL__ */ #endif /* __ACLINUXEX_H__ */ |
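ACPI_DIV_64_BY_32() above glues two 32-bit halves into a 64-bit dividend and lets do_div() produce the remainder, leaving the quotient behind in the dividend. A small userspace sketch of the same arithmetic, using a hypothetical demo_div_64_by_32() helper in place of the kernel macro:

#include <stdint.h>
#include <stdio.h>

static void demo_div_64_by_32(uint32_t n_hi, uint32_t n_lo, uint32_t d32,
			      uint32_t *q32, uint32_t *r32)
{
	uint64_t n = ((uint64_t)n_hi << 32) | n_lo;

	*r32 = (uint32_t)(n % d32);	/* what do_div() returns */
	*q32 = (uint32_t)(n / d32);	/* quotient, truncated to 32 bits */
}

int main(void)
{
	uint32_t q, r;

	/* 0x0000000200000005 / 7 = 0x49249249 remainder 6 (quotient fits in 32 bits) */
	demo_div_64_by_32(0x2, 0x5, 7, &q, &r);
	printf("q=0x%08x r=%u\n", q, r);
	return 0;
}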
1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 | // SPDX-License-Identifier: GPL-2.0 /* * uprobes-based tracing events * * Copyright (C) IBM Corporation, 2010-2012 * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com> */ #define pr_fmt(fmt) "trace_uprobe: " fmt #include <linux/bpf-cgroup.h> #include <linux/cleanup.h> #include <linux/ctype.h> #include <linux/filter.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/percpu.h> #include <linux/rculist.h> #include <linux/security.h> #include <linux/string.h> #include <linux/uaccess.h> #include <linux/uprobes.h> #include "trace.h" #include "trace_dynevent.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" #define UPROBE_EVENT_SYSTEM "uprobes" struct uprobe_trace_entry_head { struct trace_entry ent; unsigned long vaddr[]; }; #define SIZEOF_TRACE_ENTRY(is_return) \ (sizeof(struct uprobe_trace_entry_head) + \ sizeof(unsigned long) * (is_return ? 2 : 1)) #define DATAOF_TRACE_ENTRY(entry, is_return) \ ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return)) static int trace_uprobe_create(const char *raw_command); static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev); static int trace_uprobe_release(struct dyn_event *ev); static bool trace_uprobe_is_busy(struct dyn_event *ev); static bool trace_uprobe_match(const char *system, const char *event, int argc, const char **argv, struct dyn_event *ev); static struct dyn_event_operations trace_uprobe_ops = { .create = trace_uprobe_create, .show = trace_uprobe_show, .is_busy = trace_uprobe_is_busy, .free = trace_uprobe_release, .match = trace_uprobe_match, }; /* * uprobe event core functions */ struct trace_uprobe { struct dyn_event devent; struct uprobe_consumer consumer; struct path path; char *filename; struct uprobe *uprobe; unsigned long offset; unsigned long ref_ctr_offset; unsigned long __percpu *nhits; struct trace_probe tp; }; static bool is_trace_uprobe(struct dyn_event *ev) { return ev->ops == &trace_uprobe_ops; } static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev) { return container_of(ev, struct trace_uprobe, devent); } /** * for_each_trace_uprobe - iterate over the trace_uprobe list * @pos: the struct trace_uprobe * for each entry * @dpos: the struct dyn_event * to use as a loop cursor */ #define for_each_trace_uprobe(pos, dpos) \ for_each_dyn_event(dpos) \ if (is_trace_uprobe(dpos) && (pos = to_trace_uprobe(dpos))) static int register_uprobe_event(struct trace_uprobe *tu); static int unregister_uprobe_event(struct trace_uprobe *tu); static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs, __u64 *data); static int uretprobe_dispatcher(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs, __u64 *data); #ifdef CONFIG_STACK_GROWSUP static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) { return addr - (n * sizeof(long)); } #else static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) { return addr + (n * sizeof(long)); } #endif static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) { unsigned long ret; unsigned 
long addr = user_stack_pointer(regs); addr = adjust_stack_addr(addr, n); if (copy_from_user(&ret, (void __force __user *) addr, sizeof(ret))) return 0; return ret; } /* * Uprobes-specific fetch functions */ static nokprobe_inline int probe_mem_read(void *dest, void *src, size_t size) { void __user *vaddr = (void __force __user *)src; return copy_from_user(dest, vaddr, size) ? -EFAULT : 0; } static nokprobe_inline int probe_mem_read_user(void *dest, void *src, size_t size) { return probe_mem_read(dest, src, size); } /* * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max * length and relative data location. */ static nokprobe_inline int fetch_store_string(unsigned long addr, void *dest, void *base) { long ret; u32 loc = *(u32 *)dest; int maxlen = get_loc_len(loc); u8 *dst = get_loc_data(dest, base); void __user *src = (void __force __user *) addr; if (unlikely(!maxlen)) return -ENOMEM; if (addr == FETCH_TOKEN_COMM) ret = strscpy(dst, current->comm, maxlen); else ret = strncpy_from_user(dst, src, maxlen); if (ret >= 0) { if (ret == maxlen) dst[ret - 1] = '\0'; else /* * Include the terminating null byte. In this case it * was copied by strncpy_from_user but not accounted * for in ret. */ ret++; *(u32 *)dest = make_data_loc(ret, (void *)dst - base); } else *(u32 *)dest = make_data_loc(0, (void *)dst - base); return ret; } static nokprobe_inline int fetch_store_string_user(unsigned long addr, void *dest, void *base) { return fetch_store_string(addr, dest, base); } /* Return the length of string -- including null terminal byte */ static nokprobe_inline int fetch_store_strlen(unsigned long addr) { int len; void __user *vaddr = (void __force __user *) addr; if (addr == FETCH_TOKEN_COMM) len = strlen(current->comm) + 1; else len = strnlen_user(vaddr, MAX_STRING_SIZE); return (len > MAX_STRING_SIZE) ? 
0 : len; } static nokprobe_inline int fetch_store_strlen_user(unsigned long addr) { return fetch_store_strlen(addr); } static unsigned long translate_user_vaddr(unsigned long file_offset) { unsigned long base_addr; struct uprobe_dispatch_data *udd; udd = (void *) current->utask->vaddr; base_addr = udd->bp_addr - udd->tu->offset; return base_addr + file_offset; } /* Note that we don't verify it, since the code does not come from user space */ static int process_fetch_insn(struct fetch_insn *code, void *rec, void *edata, void *dest, void *base) { struct pt_regs *regs = rec; unsigned long val; int ret; /* 1st stage: get value from context */ switch (code->op) { case FETCH_OP_REG: val = regs_get_register(regs, code->param); break; case FETCH_OP_STACK: val = get_user_stack_nth(regs, code->param); break; case FETCH_OP_STACKP: val = user_stack_pointer(regs); break; case FETCH_OP_RETVAL: val = regs_return_value(regs); break; case FETCH_OP_COMM: val = FETCH_TOKEN_COMM; break; case FETCH_OP_FOFFS: val = translate_user_vaddr(code->immediate); break; default: ret = process_common_fetch_insn(code, &val); if (ret < 0) return ret; } code++; return process_fetch_insn_bottom(code, val, dest, base); } NOKPROBE_SYMBOL(process_fetch_insn) static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) { rwlock_init(&filter->rwlock); filter->nr_systemwide = 0; INIT_LIST_HEAD(&filter->perf_events); } static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter) { return !filter->nr_systemwide && list_empty(&filter->perf_events); } static inline bool is_ret_probe(struct trace_uprobe *tu) { return tu->consumer.ret_handler != NULL; } static bool trace_uprobe_is_busy(struct dyn_event *ev) { struct trace_uprobe *tu = to_trace_uprobe(ev); return trace_probe_is_enabled(&tu->tp); } static bool trace_uprobe_match_command_head(struct trace_uprobe *tu, int argc, const char **argv) { char buf[MAX_ARGSTR_LEN + 1]; int len; if (!argc) return true; len = strlen(tu->filename); if (strncmp(tu->filename, argv[0], len) || argv[0][len] != ':') return false; if (tu->ref_ctr_offset == 0) snprintf(buf, sizeof(buf), "0x%0*lx", (int)(sizeof(void *) * 2), tu->offset); else snprintf(buf, sizeof(buf), "0x%0*lx(0x%lx)", (int)(sizeof(void *) * 2), tu->offset, tu->ref_ctr_offset); if (strcmp(buf, &argv[0][len + 1])) return false; argc--; argv++; return trace_probe_match_command_args(&tu->tp, argc, argv); } static bool trace_uprobe_match(const char *system, const char *event, int argc, const char **argv, struct dyn_event *ev) { struct trace_uprobe *tu = to_trace_uprobe(ev); return (event[0] == '\0' || strcmp(trace_probe_name(&tu->tp), event) == 0) && (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0) && trace_uprobe_match_command_head(tu, argc, argv); } static nokprobe_inline struct trace_uprobe * trace_uprobe_primary_from_call(struct trace_event_call *call) { struct trace_probe *tp; tp = trace_probe_primary_from_call(call); if (WARN_ON_ONCE(!tp)) return NULL; return container_of(tp, struct trace_uprobe, tp); } /* * Allocate new trace_uprobe and initialize it (including uprobes). 
*/ static struct trace_uprobe * alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) { struct trace_uprobe *tu; int ret; tu = kzalloc(struct_size(tu, tp.args, nargs), GFP_KERNEL); if (!tu) return ERR_PTR(-ENOMEM); tu->nhits = alloc_percpu(unsigned long); if (!tu->nhits) { ret = -ENOMEM; goto error; } ret = trace_probe_init(&tu->tp, event, group, true, nargs); if (ret < 0) goto error; dyn_event_init(&tu->devent, &trace_uprobe_ops); tu->consumer.handler = uprobe_dispatcher; if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; init_trace_uprobe_filter(tu->tp.event->filter); return tu; error: free_percpu(tu->nhits); kfree(tu); return ERR_PTR(ret); } static void free_trace_uprobe(struct trace_uprobe *tu) { if (!tu) return; path_put(&tu->path); trace_probe_cleanup(&tu->tp); kfree(tu->filename); free_percpu(tu->nhits); kfree(tu); } static struct trace_uprobe *find_probe_event(const char *event, const char *group) { struct dyn_event *pos; struct trace_uprobe *tu; for_each_trace_uprobe(tu, pos) if (strcmp(trace_probe_name(&tu->tp), event) == 0 && strcmp(trace_probe_group_name(&tu->tp), group) == 0) return tu; return NULL; } /* Unregister a trace_uprobe and probe_event */ static int unregister_trace_uprobe(struct trace_uprobe *tu) { int ret; if (trace_probe_has_sibling(&tu->tp)) goto unreg; /* If there's a reference to the dynamic event */ if (trace_event_dyn_busy(trace_probe_event_call(&tu->tp))) return -EBUSY; ret = unregister_uprobe_event(tu); if (ret) return ret; unreg: dyn_event_remove(&tu->devent); trace_probe_unlink(&tu->tp); free_trace_uprobe(tu); return 0; } static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig, struct trace_uprobe *comp) { struct trace_probe_event *tpe = orig->tp.event; struct inode *comp_inode = d_real_inode(comp->path.dentry); int i; list_for_each_entry(orig, &tpe->probes, tp.list) { if (comp_inode != d_real_inode(orig->path.dentry) || comp->offset != orig->offset) continue; /* * trace_probe_compare_arg_type() ensured that nr_args and * each argument name and type are same. Let's compare comm. */ for (i = 0; i < orig->tp.nr_args; i++) { if (strcmp(orig->tp.args[i].comm, comp->tp.args[i].comm)) break; } if (i == orig->tp.nr_args) return true; } return false; } static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to) { int ret; ret = trace_probe_compare_arg_type(&tu->tp, &to->tp); if (ret) { /* Note that argument starts index = 2 */ trace_probe_log_set_index(ret + 1); trace_probe_log_err(0, DIFF_ARG_TYPE); return -EEXIST; } if (trace_uprobe_has_same_uprobe(to, tu)) { trace_probe_log_set_index(0); trace_probe_log_err(0, SAME_PROBE); return -EEXIST; } /* Append to existing event */ ret = trace_probe_append(&tu->tp, &to->tp); if (!ret) dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp)); return ret; } /* * Uprobe with multiple reference counter is not allowed. i.e. * If inode and offset matches, reference counter offset *must* * match as well. Though, there is one exception: If user is * replacing old trace_uprobe with new one(same group/event), * then we allow same uprobe with new reference counter as far * as the new one does not conflict with any other existing * ones. 
*/ static int validate_ref_ctr_offset(struct trace_uprobe *new) { struct dyn_event *pos; struct trace_uprobe *tmp; struct inode *new_inode = d_real_inode(new->path.dentry); for_each_trace_uprobe(tmp, pos) { if (new_inode == d_real_inode(tmp->path.dentry) && new->offset == tmp->offset && new->ref_ctr_offset != tmp->ref_ctr_offset) { pr_warn("Reference counter offset mismatch."); return -EINVAL; } } return 0; } /* Register a trace_uprobe and probe_event */ static int register_trace_uprobe(struct trace_uprobe *tu) { struct trace_uprobe *old_tu; int ret; guard(mutex)(&event_mutex); ret = validate_ref_ctr_offset(tu); if (ret) return ret; /* register as an event */ old_tu = find_probe_event(trace_probe_name(&tu->tp), trace_probe_group_name(&tu->tp)); if (old_tu) { if (is_ret_probe(tu) != is_ret_probe(old_tu)) { trace_probe_log_set_index(0); trace_probe_log_err(0, DIFF_PROBE_TYPE); return -EEXIST; } return append_trace_uprobe(tu, old_tu); } ret = register_uprobe_event(tu); if (ret) { if (ret == -EEXIST) { trace_probe_log_set_index(0); trace_probe_log_err(0, EVENT_EXIST); } else pr_warn("Failed to register probe event(%d)\n", ret); return ret; } dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp)); return ret; } /* * Argument syntax: * - Add uprobe: p|r[:[GRP/][EVENT]] PATH:OFFSET[%return][(REF)] [FETCHARGS] */ static int __trace_uprobe_create(int argc, const char **argv) { const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; char *arg, *filename, *rctr, *rctr_end, *tmp; unsigned long offset, ref_ctr_offset; char *gbuf __free(kfree) = NULL; char *buf __free(kfree) = NULL; enum probe_print_type ptype; struct trace_uprobe *tu; bool is_return = false; struct path path; int i, ret; ref_ctr_offset = 0; switch (argv[0][0]) { case 'r': is_return = true; break; case 'p': break; default: return -ECANCELED; } if (argc < 2) return -ECANCELED; trace_probe_log_init("trace_uprobe", argc, argv); if (argc - 2 > MAX_TRACE_ARGS) { trace_probe_log_set_index(2); trace_probe_log_err(0, TOO_MANY_ARGS); return -E2BIG; } if (argv[0][1] == ':') event = &argv[0][2]; if (!strchr(argv[1], '/')) return -ECANCELED; filename = kstrdup(argv[1], GFP_KERNEL); if (!filename) return -ENOMEM; /* Find the last occurrence, in case the path contains ':' too. */ arg = strrchr(filename, ':'); if (!arg || !isdigit(arg[1])) { kfree(filename); return -ECANCELED; } trace_probe_log_set_index(1); /* filename is the 2nd argument */ *arg++ = '\0'; ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) { trace_probe_log_err(0, FILE_NOT_FOUND); kfree(filename); trace_probe_log_clear(); return ret; } if (!d_is_reg(path.dentry)) { trace_probe_log_err(0, NO_REGULAR_FILE); ret = -EINVAL; goto fail_address_parse; } /* Parse reference counter offset if specified. */ rctr = strchr(arg, '('); if (rctr) { rctr_end = strchr(rctr, ')'); if (!rctr_end) { ret = -EINVAL; rctr_end = rctr + strlen(rctr); trace_probe_log_err(rctr_end - filename, REFCNT_OPEN_BRACE); goto fail_address_parse; } else if (rctr_end[1] != '\0') { ret = -EINVAL; trace_probe_log_err(rctr_end + 1 - filename, BAD_REFCNT_SUFFIX); goto fail_address_parse; } *rctr++ = '\0'; *rctr_end = '\0'; ret = kstrtoul(rctr, 0, &ref_ctr_offset); if (ret) { trace_probe_log_err(rctr - filename, BAD_REFCNT); goto fail_address_parse; } } /* Check if there is %return suffix */ tmp = strchr(arg, '%'); if (tmp) { if (!strcmp(tmp, "%return")) { *tmp = '\0'; is_return = true; } else { trace_probe_log_err(tmp - filename, BAD_ADDR_SUFFIX); ret = -EINVAL; goto fail_address_parse; } } /* Parse uprobe offset. 
*/ ret = kstrtoul(arg, 0, &offset); if (ret) { trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS); goto fail_address_parse; } /* setup a probe */ trace_probe_log_set_index(0); if (event) { gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); if (!gbuf) goto fail_mem; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) goto fail_address_parse; } if (!event) { char *tail; char *ptr; tail = kstrdup(kbasename(filename), GFP_KERNEL); if (!tail) goto fail_mem; ptr = strpbrk(tail, ".-_"); if (ptr) *ptr = '\0'; buf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); if (!buf) goto fail_mem; snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset); event = buf; kfree(tail); } argc -= 2; argv += 2; tu = alloc_trace_uprobe(group, event, argc, is_return); if (IS_ERR(tu)) { ret = PTR_ERR(tu); /* This must return -ENOMEM otherwise there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); goto fail_address_parse; } tu->offset = offset; tu->ref_ctr_offset = ref_ctr_offset; tu->path = path; tu->filename = filename; /* parse arguments */ for (i = 0; i < argc; i++) { struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) { ret = -ENOMEM; goto error; } ctx->flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER; trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], ctx); if (ret) goto error; } ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; ret = traceprobe_set_print_fmt(&tu->tp, ptype); if (ret < 0) goto error; ret = register_trace_uprobe(tu); if (!ret) goto out; error: free_trace_uprobe(tu); out: trace_probe_log_clear(); return ret; fail_mem: ret = -ENOMEM; fail_address_parse: trace_probe_log_clear(); path_put(&path); kfree(filename); return ret; } int trace_uprobe_create(const char *raw_command) { return trace_probe_create(raw_command, __trace_uprobe_create); } static int create_or_delete_trace_uprobe(const char *raw_command) { int ret; if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_uprobe_ops); ret = dyn_event_create(raw_command, &trace_uprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } static int trace_uprobe_release(struct dyn_event *ev) { struct trace_uprobe *tu = to_trace_uprobe(ev); return unregister_trace_uprobe(tu); } /* Probes listing interfaces */ static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev) { struct trace_uprobe *tu = to_trace_uprobe(ev); char c = is_ret_probe(tu) ? 
'r' : 'p'; int i; seq_printf(m, "%c:%s/%s %s:0x%0*lx", c, trace_probe_group_name(&tu->tp), trace_probe_name(&tu->tp), tu->filename, (int)(sizeof(void *) * 2), tu->offset); if (tu->ref_ctr_offset) seq_printf(m, "(0x%lx)", tu->ref_ctr_offset); for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); seq_putc(m, '\n'); return 0; } static int probes_seq_show(struct seq_file *m, void *v) { struct dyn_event *ev = v; if (!is_trace_uprobe(ev)) return 0; return trace_uprobe_show(m, ev); } static const struct seq_operations probes_seq_op = { .start = dyn_event_seq_start, .next = dyn_event_seq_next, .stop = dyn_event_seq_stop, .show = probes_seq_show }; static int probes_open(struct inode *inode, struct file *file) { int ret; ret = security_locked_down(LOCKDOWN_TRACEFS); if (ret) return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { ret = dyn_events_release_all(&trace_uprobe_ops); if (ret) return ret; } return seq_open(file, &probes_seq_op); } static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { return trace_parse_run_command(file, buffer, count, ppos, create_or_delete_trace_uprobe); } static const struct file_operations uprobe_events_ops = { .owner = THIS_MODULE, .open = probes_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, .write = probes_write, }; /* Probes profiling interfaces */ static int probes_profile_seq_show(struct seq_file *m, void *v) { struct dyn_event *ev = v; struct trace_uprobe *tu; unsigned long nhits; int cpu; if (!is_trace_uprobe(ev)) return 0; tu = to_trace_uprobe(ev); nhits = 0; for_each_possible_cpu(cpu) { nhits += per_cpu(*tu->nhits, cpu); } seq_printf(m, " %s %-44s %15lu\n", tu->filename, trace_probe_name(&tu->tp), nhits); return 0; } static const struct seq_operations profile_seq_op = { .start = dyn_event_seq_start, .next = dyn_event_seq_next, .stop = dyn_event_seq_stop, .show = probes_profile_seq_show }; static int profile_open(struct inode *inode, struct file *file) { int ret; ret = security_locked_down(LOCKDOWN_TRACEFS); if (ret) return ret; return seq_open(file, &profile_seq_op); } static const struct file_operations uprobe_profile_ops = { .owner = THIS_MODULE, .open = profile_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; struct uprobe_cpu_buffer { struct mutex mutex; void *buf; int dsize; }; static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; static int uprobe_buffer_refcnt; #define MAX_UCB_BUFFER_SIZE PAGE_SIZE static int uprobe_buffer_init(void) { int cpu, err_cpu; uprobe_cpu_buffer = alloc_percpu(struct uprobe_cpu_buffer); if (uprobe_cpu_buffer == NULL) return -ENOMEM; for_each_possible_cpu(cpu) { struct page *p = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); if (p == NULL) { err_cpu = cpu; goto err; } per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf = page_address(p); mutex_init(&per_cpu_ptr(uprobe_cpu_buffer, cpu)->mutex); } return 0; err: for_each_possible_cpu(cpu) { if (cpu == err_cpu) break; free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf); } free_percpu(uprobe_cpu_buffer); return -ENOMEM; } static int uprobe_buffer_enable(void) { int ret = 0; BUG_ON(!mutex_is_locked(&event_mutex)); if (uprobe_buffer_refcnt++ == 0) { ret = uprobe_buffer_init(); if (ret < 0) uprobe_buffer_refcnt--; } return ret; } static void uprobe_buffer_disable(void) { int cpu; BUG_ON(!mutex_is_locked(&event_mutex)); if (--uprobe_buffer_refcnt == 0) { for_each_possible_cpu(cpu) free_page((unsigned 
long)per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf); free_percpu(uprobe_cpu_buffer); uprobe_cpu_buffer = NULL; } } static struct uprobe_cpu_buffer *uprobe_buffer_get(void) { struct uprobe_cpu_buffer *ucb; int cpu; cpu = raw_smp_processor_id(); ucb = per_cpu_ptr(uprobe_cpu_buffer, cpu); /* * Use per-cpu buffers for fastest access, but we might migrate * so the mutex makes sure we have sole access to it. */ mutex_lock(&ucb->mutex); return ucb; } static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) { if (!ucb) return; mutex_unlock(&ucb->mutex); } static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, struct pt_regs *regs, struct uprobe_cpu_buffer **ucbp) { struct uprobe_cpu_buffer *ucb; int dsize, esize; if (*ucbp) return *ucbp; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); dsize = __get_data_size(&tu->tp, regs, NULL); ucb = uprobe_buffer_get(); ucb->dsize = tu->tp.size + dsize; if (WARN_ON_ONCE(ucb->dsize > MAX_UCB_BUFFER_SIZE)) { ucb->dsize = MAX_UCB_BUFFER_SIZE; dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size; } store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); *ucbp = ucb; return ucb; } static void __uprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, struct uprobe_cpu_buffer *ucb, struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; struct trace_event_buffer fbuffer; void *data; int size, esize; struct trace_event_call *call = trace_probe_event_call(&tu->tp); WARN_ON(call != trace_file->event_call); if (trace_trigger_soft_disabled(trace_file)) return; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + ucb->dsize; entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); if (!entry) return; if (is_ret_probe(tu)) { entry->vaddr[0] = func; entry->vaddr[1] = instruction_pointer(regs); data = DATAOF_TRACE_ENTRY(entry, true); } else { entry->vaddr[0] = instruction_pointer(regs); data = DATAOF_TRACE_ENTRY(entry, false); } memcpy(data, ucb->buf, ucb->dsize); trace_event_buffer_commit(&fbuffer); } /* uprobe handler */ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; struct uprobe_cpu_buffer *ucb; if (is_ret_probe(tu)) return 0; ucb = prepare_uprobe_buffer(tu, regs, ucbp); rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) __uprobe_trace_func(tu, 0, regs, ucb, link->file); rcu_read_unlock(); return 0; } static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; struct uprobe_cpu_buffer *ucb; ucb = prepare_uprobe_buffer(tu, regs, ucbp); rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) __uprobe_trace_func(tu, func, regs, ucb, link->file); rcu_read_unlock(); } /* Event entry printers */ static enum print_line_t print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) { struct uprobe_trace_entry_head *entry; struct trace_seq *s = &iter->seq; struct trace_uprobe *tu; u8 *data; entry = (struct uprobe_trace_entry_head *)iter->ent; tu = trace_uprobe_primary_from_call( container_of(event, struct trace_event_call, event)); if (unlikely(!tu)) goto out; if (is_ret_probe(tu)) { trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", trace_probe_name(&tu->tp), entry->vaddr[1], entry->vaddr[0]); data = DATAOF_TRACE_ENTRY(entry, true); } else { trace_seq_printf(s, "%s: (0x%lx)", trace_probe_name(&tu->tp), entry->vaddr[0]); data = DATAOF_TRACE_ENTRY(entry, false); } if 
(trace_probe_print_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0) goto out; trace_seq_putc(s, '\n'); out: return trace_handle_return(s); } typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct *mm); static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) { struct inode *inode = d_real_inode(tu->path.dentry); struct uprobe *uprobe; tu->consumer.filter = filter; uprobe = uprobe_register(inode, tu->offset, tu->ref_ctr_offset, &tu->consumer); if (IS_ERR(uprobe)) return PTR_ERR(uprobe); tu->uprobe = uprobe; return 0; } static void __probe_event_disable(struct trace_probe *tp) { struct trace_uprobe *tu; bool sync = false; tu = container_of(tp, struct trace_uprobe, tp); WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { if (!tu->uprobe) continue; uprobe_unregister_nosync(tu->uprobe, &tu->consumer); sync = true; tu->uprobe = NULL; } if (sync) uprobe_unregister_sync(); } static int probe_event_enable(struct trace_event_call *call, struct trace_event_file *file, filter_func_t filter) { struct trace_probe *tp; struct trace_uprobe *tu; bool enabled; int ret; tp = trace_probe_primary_from_call(call); if (WARN_ON_ONCE(!tp)) return -ENODEV; enabled = trace_probe_is_enabled(tp); /* This may also change "enabled" state */ if (file) { if (trace_probe_test_flag(tp, TP_FLAG_PROFILE)) return -EINTR; ret = trace_probe_add_file(tp, file); if (ret < 0) return ret; } else { if (trace_probe_test_flag(tp, TP_FLAG_TRACE)) return -EINTR; trace_probe_set_flag(tp, TP_FLAG_PROFILE); } tu = container_of(tp, struct trace_uprobe, tp); WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); if (enabled) return 0; ret = uprobe_buffer_enable(); if (ret) goto err_flags; list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { ret = trace_uprobe_enable(tu, filter); if (ret) { __probe_event_disable(tp); goto err_buffer; } } return 0; err_buffer: uprobe_buffer_disable(); err_flags: if (file) trace_probe_remove_file(tp, file); else trace_probe_clear_flag(tp, TP_FLAG_PROFILE); return ret; } static void probe_event_disable(struct trace_event_call *call, struct trace_event_file *file) { struct trace_probe *tp; tp = trace_probe_primary_from_call(call); if (WARN_ON_ONCE(!tp)) return; if (!trace_probe_is_enabled(tp)) return; if (file) { if (trace_probe_remove_file(tp, file) < 0) return; if (trace_probe_is_enabled(tp)) return; } else trace_probe_clear_flag(tp, TP_FLAG_PROFILE); __probe_event_disable(tp); uprobe_buffer_disable(); } static int uprobe_event_define_fields(struct trace_event_call *event_call) { int ret, size; struct uprobe_trace_entry_head field; struct trace_uprobe *tu; tu = trace_uprobe_primary_from_call(event_call); if (unlikely(!tu)) return -ENODEV; if (is_ret_probe(tu)) { DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0); size = SIZEOF_TRACE_ENTRY(true); } else { DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0); size = SIZEOF_TRACE_ENTRY(false); } return traceprobe_define_arg_fields(event_call, size, &tu->tp); } #ifdef CONFIG_PERF_EVENTS static bool __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) { struct perf_event *event; list_for_each_entry(event, &filter->perf_events, hw.tp_list) { if (event->hw.target->mm == mm) return true; } return false; } static inline bool trace_uprobe_filter_event(struct trace_uprobe_filter *filter, struct perf_event *event) { return 
__uprobe_perf_filter(filter, event->hw.target->mm); } static bool trace_uprobe_filter_remove(struct trace_uprobe_filter *filter, struct perf_event *event) { bool done; write_lock(&filter->rwlock); if (event->hw.target) { list_del(&event->hw.tp_list); done = filter->nr_systemwide || (event->hw.target->flags & PF_EXITING) || trace_uprobe_filter_event(filter, event); } else { filter->nr_systemwide--; done = filter->nr_systemwide; } write_unlock(&filter->rwlock); return done; } /* This returns true if the filter always covers target mm */ static bool trace_uprobe_filter_add(struct trace_uprobe_filter *filter, struct perf_event *event) { bool done; write_lock(&filter->rwlock); if (event->hw.target) { /* * event->parent != NULL means copy_process(), we can avoid * uprobe_apply(). current->mm must be probed and we can rely * on dup_mmap() which preserves the already installed bp's. * * attr.enable_on_exec means that exec/mmap will install the * breakpoints we need. */ done = filter->nr_systemwide || event->parent || event->attr.enable_on_exec || trace_uprobe_filter_event(filter, event); list_add(&event->hw.tp_list, &filter->perf_events); } else { done = filter->nr_systemwide; filter->nr_systemwide++; } write_unlock(&filter->rwlock); return done; } static int uprobe_perf_close(struct trace_event_call *call, struct perf_event *event) { struct trace_probe *tp; struct trace_uprobe *tu; int ret = 0; tp = trace_probe_primary_from_call(call); if (WARN_ON_ONCE(!tp)) return -ENODEV; tu = container_of(tp, struct trace_uprobe, tp); if (trace_uprobe_filter_remove(tu->tp.event->filter, event)) return 0; list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { ret = uprobe_apply(tu->uprobe, &tu->consumer, false); if (ret) break; } return ret; } static int uprobe_perf_open(struct trace_event_call *call, struct perf_event *event) { struct trace_probe *tp; struct trace_uprobe *tu; int err = 0; tp = trace_probe_primary_from_call(call); if (WARN_ON_ONCE(!tp)) return -ENODEV; tu = container_of(tp, struct trace_uprobe, tp); if (trace_uprobe_filter_add(tu->tp.event->filter, event)) return 0; list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { err = uprobe_apply(tu->uprobe, &tu->consumer, true); if (err) { uprobe_perf_close(call, event); break; } } return err; } static bool uprobe_perf_filter(struct uprobe_consumer *uc, struct mm_struct *mm) { struct trace_uprobe_filter *filter; struct trace_uprobe *tu; int ret; tu = container_of(uc, struct trace_uprobe, consumer); filter = tu->tp.event->filter; /* * speculative short-circuiting check to avoid unnecessarily taking * filter->rwlock below, if the uprobe has system-wide consumer */ if (READ_ONCE(filter->nr_systemwide)) return true; read_lock(&filter->rwlock); ret = __uprobe_perf_filter(filter, mm); read_unlock(&filter->rwlock); return ret; } static void __uprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, struct uprobe_cpu_buffer **ucbp) { struct trace_event_call *call = trace_probe_event_call(&tu->tp); struct uprobe_trace_entry_head *entry; struct uprobe_cpu_buffer *ucb; struct hlist_head *head; void *data; int size, esize; int rctx; #ifdef CONFIG_BPF_EVENTS if (bpf_prog_array_valid(call)) { const struct bpf_prog_array *array; u32 ret; rcu_read_lock_trace(); array = rcu_dereference_check(call->prog_array, rcu_read_lock_trace_held()); ret = bpf_prog_run_array_uprobe(array, regs, bpf_prog_run); rcu_read_unlock_trace(); if (!ret) return; } #endif /* CONFIG_BPF_EVENTS */ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); ucb = 
prepare_uprobe_buffer(tu, regs, ucbp); size = esize + ucb->dsize; size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; preempt_disable(); head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) goto out; entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) goto out; if (is_ret_probe(tu)) { entry->vaddr[0] = func; entry->vaddr[1] = instruction_pointer(regs); data = DATAOF_TRACE_ENTRY(entry, true); } else { entry->vaddr[0] = instruction_pointer(regs); data = DATAOF_TRACE_ENTRY(entry, false); } memcpy(data, ucb->buf, ucb->dsize); if (size - esize > ucb->dsize) memset(data + ucb->dsize, 0, size - esize - ucb->dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); out: preempt_enable(); } /* uprobe profile handler */ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, struct uprobe_cpu_buffer **ucbp) { if (!uprobe_perf_filter(&tu->consumer, current->mm)) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) __uprobe_perf_func(tu, 0, regs, ucbp); return 0; } static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, struct uprobe_cpu_buffer **ucbp) { __uprobe_perf_func(tu, func, regs, ucbp); } int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, const char **filename, u64 *probe_offset, u64 *probe_addr, bool perf_type_tracepoint) { const char *pevent = trace_event_name(event->tp_event); const char *group = event->tp_event->class->system; struct trace_uprobe *tu; if (perf_type_tracepoint) tu = find_probe_event(pevent, group); else tu = trace_uprobe_primary_from_call(event->tp_event); if (!tu) return -EINVAL; *fd_type = is_ret_probe(tu) ? 
BPF_FD_TYPE_URETPROBE : BPF_FD_TYPE_UPROBE; *filename = tu->filename; *probe_offset = tu->offset; *probe_addr = tu->ref_ctr_offset; return 0; } #endif /* CONFIG_PERF_EVENTS */ static int trace_uprobe_register(struct trace_event_call *event, enum trace_reg type, void *data) { struct trace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: return probe_event_enable(event, file, NULL); case TRACE_REG_UNREGISTER: probe_event_disable(event, file); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: return probe_event_enable(event, NULL, uprobe_perf_filter); case TRACE_REG_PERF_UNREGISTER: probe_event_disable(event, NULL); return 0; case TRACE_REG_PERF_OPEN: return uprobe_perf_open(event, data); case TRACE_REG_PERF_CLOSE: return uprobe_perf_close(event, data); #endif default: return 0; } } static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs, __u64 *data) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; struct uprobe_cpu_buffer *ucb = NULL; unsigned int flags; int ret = 0; tu = container_of(con, struct trace_uprobe, consumer); this_cpu_inc(*tu->nhits); udd.tu = tu; udd.bp_addr = instruction_pointer(regs); current->utask->vaddr = (unsigned long) &udd; if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; flags = trace_probe_load_flag(&tu->tp); if (flags & TP_FLAG_TRACE) ret |= uprobe_trace_func(tu, regs, &ucb); #ifdef CONFIG_PERF_EVENTS if (flags & TP_FLAG_PROFILE) ret |= uprobe_perf_func(tu, regs, &ucb); #endif uprobe_buffer_put(ucb); return ret; } static int uretprobe_dispatcher(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs, __u64 *data) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; struct uprobe_cpu_buffer *ucb = NULL; unsigned int flags; tu = container_of(con, struct trace_uprobe, consumer); udd.tu = tu; udd.bp_addr = func; current->utask->vaddr = (unsigned long) &udd; if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; flags = trace_probe_load_flag(&tu->tp); if (flags & TP_FLAG_TRACE) uretprobe_trace_func(tu, func, regs, &ucb); #ifdef CONFIG_PERF_EVENTS if (flags & TP_FLAG_PROFILE) uretprobe_perf_func(tu, func, regs, &ucb); #endif uprobe_buffer_put(ucb); return 0; } static struct trace_event_functions uprobe_funcs = { .trace = print_uprobe_event }; static struct trace_event_fields uprobe_fields_array[] = { { .type = TRACE_FUNCTION_TYPE, .define_fields = uprobe_event_define_fields }, {} }; static inline void init_trace_event_call(struct trace_uprobe *tu) { struct trace_event_call *call = trace_probe_event_call(&tu->tp); call->event.funcs = &uprobe_funcs; call->class->fields_array = uprobe_fields_array; call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY; call->class->reg = trace_uprobe_register; } static int register_uprobe_event(struct trace_uprobe *tu) { init_trace_event_call(tu); return trace_probe_register_event_call(&tu->tp); } static int unregister_uprobe_event(struct trace_uprobe *tu) { return trace_probe_unregister_event_call(&tu->tp); } #ifdef CONFIG_PERF_EVENTS struct trace_event_call * create_local_trace_uprobe(char *name, unsigned long offs, unsigned long ref_ctr_offset, bool is_return) { enum probe_print_type ptype; struct trace_uprobe *tu; struct path path; int ret; ret = kern_path(name, LOOKUP_FOLLOW, &path); if (ret) return ERR_PTR(ret); if (!d_is_reg(path.dentry)) { path_put(&path); return ERR_PTR(-EINVAL); } /* * local trace_kprobes are not added to dyn_event, so they are never * searched in find_trace_kprobe(). 
Therefore, there is no concern of * duplicated name "DUMMY_EVENT" here. */ tu = alloc_trace_uprobe(UPROBE_EVENT_SYSTEM, "DUMMY_EVENT", 0, is_return); if (IS_ERR(tu)) { pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); path_put(&path); return ERR_CAST(tu); } tu->offset = offs; tu->path = path; tu->ref_ctr_offset = ref_ctr_offset; tu->filename = kstrdup(name, GFP_KERNEL); if (!tu->filename) { ret = -ENOMEM; goto error; } init_trace_event_call(tu); ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; if (traceprobe_set_print_fmt(&tu->tp, ptype) < 0) { ret = -ENOMEM; goto error; } return trace_probe_event_call(&tu->tp); error: free_trace_uprobe(tu); return ERR_PTR(ret); } void destroy_local_trace_uprobe(struct trace_event_call *event_call) { struct trace_uprobe *tu; tu = trace_uprobe_primary_from_call(event_call); free_trace_uprobe(tu); } #endif /* CONFIG_PERF_EVENTS */ /* Make a trace interface for controlling probe points */ static __init int init_uprobe_trace(void) { int ret; ret = dyn_event_register(&trace_uprobe_ops); if (ret) return ret; ret = tracing_init_dentry(); if (ret) return 0; trace_create_file("uprobe_events", TRACE_MODE_WRITE, NULL, NULL, &uprobe_events_ops); /* Profile interface */ trace_create_file("uprobe_profile", TRACE_MODE_READ, NULL, NULL, &uprobe_profile_ops); return 0; } fs_initcall(init_uprobe_trace); |
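The tracefs files created by init_uprobe_trace() above ("uprobe_events" and "uprobe_profile") are the user-facing interface to this code. Below is a minimal userspace sketch, not part of the kernel tree, showing how a probe definition might be registered and enabled through that interface; the probe name "my_probe", the target binary and the 0x4710 offset are placeholders for a real symbol offset, and tracefs is assumed to be mounted at /sys/kernel/tracing.

/* Hypothetical userspace helper: write a string into a tracefs file. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *s)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, s, strlen(s)) != (ssize_t)strlen(s)) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* "p:" adds a probe, "r:" would add a return probe, "-:" removes one. */
	if (write_str("/sys/kernel/tracing/uprobe_events",
		      "p:my_probe /bin/true:0x4710\n"))
		perror("uprobe_events");

	/* Once enabled, hits appear in the trace buffer and in uprobe_profile. */
	if (write_str("/sys/kernel/tracing/events/uprobes/my_probe/enable", "1\n"))
		perror("enable");

	return 0;
}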
// SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/kernel.h> #include <linux/interrupt.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/sysctl.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/errno.h> #include <linux/fcntl.h> #include <linux/in.h> #include <linux/if_ether.h> /* For the statistics structure. */ #include <linux/slab.h> #include <linux/uaccess.h> #include <asm/io.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/ip.h> #include <net/arp.h> #include <net/ax25.h> #include <net/netrom.h> /* * Only allow IP over NET/ROM frames through if the netrom device is up. */ int nr_rx_ip(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; if (!netif_running(dev)) { stats->rx_dropped++; return 0; } stats->rx_packets++; stats->rx_bytes += skb->len; skb->protocol = htons(ETH_P_IP); /* Spoof incoming device */ skb->dev = dev; skb->mac_header = skb->network_header; skb_reset_network_header(skb); skb->pkt_type = PACKET_HOST; netif_rx(skb); return 1; } static int nr_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { unsigned char *buff = skb_push(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); memcpy(buff, (saddr != NULL) ? 
saddr : dev->dev_addr, dev->addr_len); buff[6] &= ~AX25_CBIT; buff[6] &= ~AX25_EBIT; buff[6] |= AX25_SSSID_SPARE; buff += AX25_ADDR_LEN; if (daddr != NULL) memcpy(buff, daddr, dev->addr_len); buff[6] &= ~AX25_CBIT; buff[6] |= AX25_EBIT; buff[6] |= AX25_SSSID_SPARE; buff += AX25_ADDR_LEN; *buff++ = READ_ONCE(sysctl_netrom_network_ttl_initialiser); *buff++ = NR_PROTO_IP; *buff++ = NR_PROTO_IP; *buff++ = 0; *buff++ = 0; *buff++ = NR_PROTOEXT; if (daddr != NULL) return 37; return -37; } static int __must_check nr_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr *sa = addr; int err; if (!memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) return 0; if (dev->flags & IFF_UP) { err = ax25_listen_register((ax25_address *)sa->sa_data, NULL); if (err) return err; ax25_listen_release((const ax25_address *)dev->dev_addr, NULL); } dev_addr_set(dev, sa->sa_data); return 0; } static int nr_open(struct net_device *dev) { int err; err = ax25_listen_register((const ax25_address *)dev->dev_addr, NULL); if (err) return err; netif_start_queue(dev); return 0; } static int nr_close(struct net_device *dev) { ax25_listen_release((const ax25_address *)dev->dev_addr, NULL); netif_stop_queue(dev); return 0; } static netdev_tx_t nr_xmit(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; unsigned int len = skb->len; if (!nr_route_frame(skb, NULL)) { kfree_skb(skb); stats->tx_errors++; return NETDEV_TX_OK; } stats->tx_packets++; stats->tx_bytes += len; return NETDEV_TX_OK; } static const struct header_ops nr_header_ops = { .create = nr_header, }; static const struct net_device_ops nr_netdev_ops = { .ndo_open = nr_open, .ndo_stop = nr_close, .ndo_start_xmit = nr_xmit, .ndo_set_mac_address = nr_set_mac_address, }; void nr_setup(struct net_device *dev) { dev->mtu = NR_MAX_PACKET_SIZE; dev->netdev_ops = &nr_netdev_ops; dev->header_ops = &nr_header_ops; dev->hard_header_len = NR_NETWORK_LEN + NR_TRANSPORT_LEN; dev->addr_len = AX25_ADDR_LEN; dev->type = ARPHRD_NETROM; /* New-style flags. */ dev->flags = IFF_NOARP; } |
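For reference, nr_header() above always pushes NR_NETWORK_LEN + NR_TRANSPORT_LEN bytes in front of the IP payload. The struct below is purely illustrative and does not exist in the tree; it only annotates that byte layout, assuming the usual definitions NR_NETWORK_LEN == 15 and NR_TRANSPORT_LEN == 5 from net/netrom.h.

/* Illustrative layout of the header built byte-by-byte in nr_header(). */
struct nr_ip_header_sketch {		/* hypothetical name, annotation only */
	unsigned char origin[7];	/* source callsign, AX25_EBIT clear */
	unsigned char dest[7];		/* destination callsign, AX25_EBIT set */
	unsigned char ttl;		/* sysctl_netrom_network_ttl_initialiser */
	unsigned char proto[2];		/* NR_PROTO_IP, NR_PROTO_IP */
	unsigned char reserved[2];	/* always zero */
	unsigned char ext;		/* NR_PROTOEXT */
};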
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_WAIT_BIT_H #define _LINUX_WAIT_BIT_H /* * Linux wait-bit related types and methods: */ #include <linux/wait.h> struct wait_bit_key { unsigned long *flags; int bit_nr; unsigned long timeout; }; struct wait_bit_queue_entry { struct wait_bit_key key; struct wait_queue_entry wq_entry; }; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } typedef int wait_bit_action_f(struct wait_bit_key *key, int mode); void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit); int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); void wake_up_bit(unsigned long *word, int bit); int out_of_line_wait_on_bit(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode); int out_of_line_wait_on_bit_timeout(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout); int out_of_line_wait_on_bit_lock(unsigned 
long *word, int, wait_bit_action_f *action, unsigned int mode); struct wait_queue_head *bit_waitqueue(unsigned long *word, int bit); extern void __init wait_bit_init(void); int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); #define DEFINE_WAIT_BIT(name, word, bit) \ struct wait_bit_queue_entry name = { \ .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ .wq_entry = { \ .private = current, \ .func = wake_bit_function, \ .entry = \ LIST_HEAD_INIT((name).wq_entry.entry), \ }, \ } extern int bit_wait(struct wait_bit_key *key, int mode); extern int bit_wait_io(struct wait_bit_key *key, int mode); extern int bit_wait_timeout(struct wait_bit_key *key, int mode); /** * wait_on_bit - wait for a bit to be cleared * @word: the address containing the bit being waited on * @bit: the bit at that address being waited on * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) * to be cleared. The clearing of the bit must be signalled with * wake_up_bit(), often as clear_and_wake_up_bit(). * * The process will wait on a waitqueue selected by hash from a shared * pool. It will only be woken on a wake_up for the target bit, even * if other processes on the same queue are waiting for other bits. * * Returned value will be zero if the bit was cleared in which case the * call has ACQUIRE semantics, or %-EINTR if the process received a * signal and the mode permitted wake up on that signal. */ static inline int wait_on_bit(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait, mode); } /** * wait_on_bit_io - wait for a bit to be cleared * @word: the address containing the bit being waited on * @bit: the bit at that address being waited on * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) * to be cleared. The clearing of the bit must be signalled with * wake_up_bit(), often as clear_and_wake_up_bit(). * * This is similar to wait_on_bit(), but calls io_schedule() instead of * schedule() for the actual waiting. * * Returned value will be zero if the bit was cleared in which case the * call has ACQUIRE semantics, or %-EINTR if the process received a * signal and the mode permitted wake up on that signal. */ static inline int wait_on_bit_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait_io, mode); } /** * wait_on_bit_timeout - wait for a bit to be cleared or a timeout to elapse * @word: the address containing the bit being waited on * @bit: the bit at that address being waited on * @mode: the task state to sleep in * @timeout: timeout, in jiffies * * Wait for the given bit in an unsigned long or bitmap (see * DECLARE_BITMAP()) to be cleared, or for a timeout to expire. The * clearing of the bit must be signalled with wake_up_bit(), often as * clear_and_wake_up_bit(). * * This is similar to wait_on_bit(), except it also takes a timeout * parameter. * * Returned value will be zero if the bit was cleared in which case the * call has ACQUIRE semantics, or %-EINTR if the process received a * signal and the mode permitted wake up on that signal, or %-EAGAIN if the * timeout elapsed. 
*/ static inline int wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, unsigned long timeout) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit_timeout(word, bit, bit_wait_timeout, mode, timeout); } /** * wait_on_bit_action - wait for a bit to be cleared * @word: the address containing the bit waited on * @bit: the bit at that address being waited on * @action: the function used to sleep, which may take special actions * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) * to be cleared. The clearing of the bit must be signalled with * wake_up_bit(), often as clear_and_wake_up_bit(). * * This is similar to wait_on_bit(), but calls @action() instead of * schedule() for the actual waiting. * * Returned value will be zero if the bit was cleared in which case the * call has ACQUIRE semantics, or the error code returned by @action if * that call returned non-zero. */ static inline int wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, action, mode); } /** * wait_on_bit_lock - wait for a bit to be cleared, then set it * @word: the address containing the bit being waited on * @bit: the bit of the word being waited on and set * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see * DECLARE_BITMAP()) to be cleared. The clearing of the bit must be * signalled with wake_up_bit(), often as clear_and_wake_up_bit(). As * soon as it is clear, atomically set it and return. * * This is similar to wait_on_bit(), but sets the bit before returning. * * Returned value will be zero if the bit was successfully set in which * case the call has the same memory sequencing semantics as * test_and_clear_bit(), or %-EINTR if the process received a signal and * the mode permitted wake up on that signal. */ static inline int wait_on_bit_lock(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode); } /** * wait_on_bit_lock_io - wait for a bit to be cleared, then set it * @word: the address containing the bit being waited on * @bit: the bit of the word being waited on and set * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see * DECLARE_BITMAP()) to be cleared. The clearing of the bit must be * signalled with wake_up_bit(), often as clear_and_wake_up_bit(). As * soon as it is clear, atomically set it and return. * * This is similar to wait_on_bit_lock(), but calls io_schedule() instead * of schedule(). * * Returns zero if the bit was (eventually) found to be clear and was * set. Returns non-zero if a signal was delivered to the process and * the @mode allows that signal to wake the process. 
*/ static inline int wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode); } /** * wait_on_bit_lock_action - wait for a bit to be cleared, then set it * @word: the address containing the bit being waited on * @bit: the bit of the word being waited on and set * @action: the function used to sleep, which may take special actions * @mode: the task state to sleep in * * This is similar to wait_on_bit_lock(), but calls @action() instead of * schedule() for the actual waiting. * * Returned value will be zero if the bit was successfully set in which * case the call has the same memory sequencing semantics as * test_and_clear_bit(), or the error code returned by @action if that * call returned non-zero. */ static inline int wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, action, mode); } extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags); extern void wake_up_var(void *var); extern wait_queue_head_t *__var_waitqueue(void *p); #define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ struct wait_queue_head *__wq_head = __var_waitqueue(var); \ struct wait_bit_queue_entry __wbq_entry; \ long __ret = ret; /* explicit shadow */ \ \ init_wait_var_entry(&__wbq_entry, var, \ exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ for (;;) { \ long __int = prepare_to_wait_event(__wq_head, \ &__wbq_entry.wq_entry, \ state); \ if (condition) \ break; \ \ if (___wait_is_interruptible(state) && __int) { \ __ret = __int; \ goto __out; \ } \ \ cmd; \ } \ finish_wait(__wq_head, &__wbq_entry.wq_entry); \ __out: __ret; \ }) #define __wait_var_event(var, condition) \ ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ schedule()) #define __wait_var_event_io(var, condition) \ ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ io_schedule()) /** * wait_var_event - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * * Wait for a @condition to be true, only re-checking when a wake up is * received for the given @var (an arbitrary kernel address which need * not be directly related to the given condition, but usually is). * * The process will wait on a waitqueue selected by hash from a shared * pool. It will only be woken on a wake_up for the given address. * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event(var, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __wait_var_event(var, condition); \ } while (0) /** * wait_var_event_io - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * * Wait for an IO related @condition to be true, only re-checking when a * wake up is received for the given @var (an arbitrary kernel address * which need not be directly related to the given condition, but * usually is). * * The process will wait on a waitqueue selected by hash from a shared * pool. It will only be woken on a wake_up for the given address. 
* * This is similar to wait_var_event(), but calls io_schedule() instead * of schedule(). * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event_io(var, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __wait_var_event_io(var, condition); \ } while (0) #define __wait_var_event_killable(var, condition) \ ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ schedule()) /** * wait_var_event_killable - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * * Wait for a @condition to be true or a fatal signal to be received, * only re-checking the condition when a wake up is received for the given * @var (an arbitrary kernel address which need not be directly related * to the given condition, but usually is). * * This is similar to wait_var_event() but returns a value which is * 0 if the condition became true, or %-ERESTARTSYS if a fatal signal * was received. * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event_killable(var, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_var_event_killable(var, condition); \ __ret; \ }) #define __wait_var_event_timeout(var, condition, timeout) \ ___wait_var_event(var, ___wait_cond_timeout(condition), \ TASK_UNINTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_var_event_timeout - wait for a variable to be updated or a timeout to expire * @var: the address of variable being waited on * @condition: the condition to wait for * @timeout: maximum time to wait in jiffies * * Wait for a @condition to be true or a timeout to expire, only * re-checking the condition when a wake up is received for the given * @var (an arbitrary kernel address which need not be directly related * to the given condition, but usually is). * * This is similar to wait_var_event() but returns a value which is 0 if * the timeout expired and the condition was still false, or the * remaining time left in the timeout (but at least 1) if the condition * was found to be true. * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event_timeout(var, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_var_event_timeout(var, condition, timeout); \ __ret; \ }) #define __wait_var_event_interruptible(var, condition) \ ___wait_var_event(var, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_var_event_interruptible - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * * Wait for a @condition to be true or a signal to be received, only * re-checking the condition when a wake up is received for the given * @var (an arbitrary kernel address which need not be directly related * to the given condition, but usually is). * * This is similar to wait_var_event() but returns a value which is 0 if * the condition became true, or %-ERESTARTSYS if a signal was received. 
* * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event_interruptible(var, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_var_event_interruptible(var, condition); \ __ret; \ }) /** * wait_var_event_any_lock - wait for a variable to be updated under a lock * @var: the address of the variable being waited on * @condition: condition to wait for * @lock: the object that is locked to protect updates to the variable * @type: prefix on lock and unlock operations * @state: waiting state, %TASK_UNINTERRUPTIBLE etc. * * Wait for a condition which can only be reliably tested while holding * a lock. The variables assessed in the condition will normally be updated * under the same lock, and the wake up should be signalled with * wake_up_var_locked() under the same lock. * * This is similar to wait_var_event(), but assumes a lock is held * while calling this function and while updating the variable. * * This must be called while the given lock is held and the lock will be * dropped when schedule() is called to wait for a wake up, and will be * reclaimed before testing the condition again. The functions used to * unlock and lock the object are constructed by appending _unlock and _lock * to @type. * * Return %-ERESTARTSYS if a signal arrives which is allowed to interrupt * the wait according to @state. */ #define wait_var_event_any_lock(var, condition, lock, type, state) \ ({ \ int __ret = 0; \ if (!(condition)) \ __ret = ___wait_var_event(var, condition, state, 0, 0, \ type ## _unlock(lock); \ schedule(); \ type ## _lock(lock)); \ __ret; \ }) /** * wait_var_event_spinlock - wait for a variable to be updated under a spinlock * @var: the address of the variable being waited on * @condition: condition to wait for * @lock: the spinlock which protects updates to the variable * * Wait for a condition which can only be reliably tested while holding * a spinlock. The variables assessed in the condition will normally be updated * under the same spinlock, and the wake up should be signalled with * wake_up_var_locked() under the same spinlock. * * This is similar to wait_var_event(), but assumes a spinlock is held * while calling this function and while updating the variable. * * This must be called while the given lock is held and the lock will be * dropped when schedule() is called to wait for a wake up, and will be * reclaimed before testing the condition again. */ #define wait_var_event_spinlock(var, condition, lock) \ wait_var_event_any_lock(var, condition, lock, spin, TASK_UNINTERRUPTIBLE) /** * wait_var_event_mutex - wait for a variable to be updated under a mutex * @var: the address of the variable being waited on * @condition: condition to wait for * @mutex: the mutex which protects updates to the variable * * Wait for a condition which can only be reliably tested while holding * a mutex. The variables assessed in the condition will normally be * updated under the same mutex, and the wake up should be signalled * with wake_up_var_locked() under the same mutex. * * This is similar to wait_var_event(), but assumes a mutex is held * while calling this function and while updating the variable. * * This must be called while the given mutex is held and the mutex will be * dropped when schedule() is called to wait for a wake up, and will be * reclaimed before testing the condition again. 
*/ #define wait_var_event_mutex(var, condition, lock) \ wait_var_event_any_lock(var, condition, lock, mutex, TASK_UNINTERRUPTIBLE) /** * wake_up_var_protected - wake up waiters for a variable asserting that it is safe * @var: the address of the variable being waited on * @cond: the condition which affirms this is safe * * When waking waiters which use wait_var_event_any_lock() the waker must be * holding the relevant lock to avoid races. This version of wake_up_var() * asserts that the relevant lock is held and so no barrier is needed. * The @cond is only tested when CONFIG_LOCKDEP is enabled. */ #define wake_up_var_protected(var, cond) \ do { \ lockdep_assert(cond); \ wake_up_var(var); \ } while (0) /** * wake_up_var_locked - wake up waiters for a variable while holding a spinlock or mutex * @var: the address of the variable being waited on * @lock: The spinlock or mutex that protects the variable * * Send a wake up for the given variable which should be waited for with * wait_var_event_spinlock() or wait_var_event_mutex(). Unlike wake_up_var(), * no extra barriers are needed as the locking provides sufficient sequencing. */ #define wake_up_var_locked(var, lock) \ wake_up_var_protected(var, lockdep_is_held(lock)) /** * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit * @bit: the bit of the word being waited on * @word: the address containing the bit being waited on * * The designated bit is cleared and any tasks waiting in wait_on_bit() * or similar will be woken. This call has RELEASE semantics so that * any changes to memory made before this call are guaranteed to be visible * after the corresponding wait_on_bit() completes. */ static inline void clear_and_wake_up_bit(int bit, unsigned long *word) { clear_bit_unlock(bit, word); /* See wake_up_bit() for which memory barrier you need to use. */ smp_mb__after_atomic(); wake_up_bit(word, bit); } /** * test_and_clear_wake_up_bit - clear a bit if it was set: wake up anyone waiting on that bit * @bit: the bit of the word being waited on * @word: the address of memory containing that bit * * If the bit is set and can be atomically cleared, any tasks waiting in * wait_on_bit() or similar will be woken. This call has the same * complete ordering semantics as test_and_clear_bit(). Any changes to * memory made before this call are guaranteed to be visible after the * corresponding wait_on_bit() completes. * * Returns %true if the bit was successfully cleared and the wake up was sent. */ static inline bool test_and_clear_wake_up_bit(int bit, unsigned long *word) { if (!test_and_clear_bit(bit, word)) return false; /* no extra barrier required */ wake_up_bit(word, bit); return true; } /** * atomic_dec_and_wake_up - decrement an atomic_t and if zero, wake up waiters * @var: the variable to dec and test * * Decrements the atomic variable and if it reaches zero, send a wake_up to any * processes waiting on the variable. * * This function has the same complete ordering semantics as atomic_dec_and_test. * * Returns %true if the variable reaches zero and the wake up was sent. */ static inline bool atomic_dec_and_wake_up(atomic_t *var) { if (!atomic_dec_and_test(var)) return false; /* No extra barrier required */ wake_up_var(var); return true; } /** * store_release_wake_up - update a variable and send a wake_up * @var: the address of the variable to be updated and woken * @val: the value to store in the variable. * * Store the given value in the variable and send a wake up to any tasks * waiting on the variable. 
All necessary barriers are included to ensure * the task calling wait_var_event() sees the new value and all values * written to memory before this call. */ #define store_release_wake_up(var, val) \ do { \ smp_store_release(var, val); \ smp_mb(); \ wake_up_var(var); \ } while (0) #endif /* _LINUX_WAIT_BIT_H */ |
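As a quick illustration of the pairing these kernel-doc comments describe, here is a minimal sketch (not part of this header): one side waits for a flag bit to clear with wait_on_bit(), the other side clears it with clear_and_wake_up_bit(). MY_BUSY and my_flags are hypothetical names used only for the example.

#include <linux/bitops.h>
#include <linux/sched.h>
#include <linux/wait_bit.h>

#define MY_BUSY		0		/* bit number inside my_flags */

static unsigned long my_flags;

/* Waiter: sleeps until MY_BUSY is clear; returns -EINTR if a signal arrives. */
static int my_wait_until_idle(void)
{
	return wait_on_bit(&my_flags, MY_BUSY, TASK_INTERRUPTIBLE);
}

/* Owner: mark busy, do work, then clear the bit and wake any waiters. */
static void my_do_work(void)
{
	set_bit(MY_BUSY, &my_flags);
	/* ... update state that waiters may read after waking ... */
	clear_and_wake_up_bit(MY_BUSY, &my_flags);
}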
// SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. * Copyright (C) 2009 Red Hat, Inc. * * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ #include <uapi/linux/sched/types.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/numa.h> #include <linux/sched/isolation.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; static LIST_HEAD(kthreads_hotplug); static DEFINE_MUTEX(kthreads_hotplug_lock); struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ char *full_name; int (*threadfn)(void *data); void *data; int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; struct completion *done; struct list_head list; }; struct kthread { unsigned long flags; unsigned int cpu; unsigned int node; int started; int result; int (*threadfn)(void *); void *data; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif /* To store the full name if task comm is truncated. */ char *full_name; struct task_struct *task; struct list_head hotplug_node; struct cpumask *preferred_affinity; }; enum KTHREAD_BITS { KTHREAD_IS_PER_CPU = 0, KTHREAD_SHOULD_STOP, KTHREAD_SHOULD_PARK, }; static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); return k->worker_private; } /* * Variant of to_kthread() that doesn't assume @p is a kthread. * * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will * always remain a kthread. For kthreads p->worker_private always * points to a struct kthread. For tasks that are not kthreads * p->worker_private is used to point to other things. * * Return NULL for any task that is not a kthread. 
*/ static inline struct kthread *__to_kthread(struct task_struct *p) { void *kthread = p->worker_private; if (kthread && !(p->flags & PF_KTHREAD)) kthread = NULL; return kthread; } void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { struct kthread *kthread = to_kthread(tsk); if (!kthread || !kthread->full_name) { strscpy(buf, tsk->comm, buf_size); return; } strscpy_pad(buf, kthread->full_name, buf_size); } bool set_kthread_struct(struct task_struct *p) { struct kthread *kthread; if (WARN_ON_ONCE(to_kthread(p))) return false; kthread = kzalloc(sizeof(*kthread), GFP_KERNEL); if (!kthread) return false; init_completion(&kthread->exited); init_completion(&kthread->parked); INIT_LIST_HEAD(&kthread->hotplug_node); p->vfork_done = &kthread->exited; kthread->task = p; kthread->node = tsk_fork_get_node(current); p->worker_private = kthread; return true; } void free_kthread_struct(struct task_struct *k) { struct kthread *kthread; /* * Can be NULL if kmalloc() in set_kthread_struct() failed. */ kthread = to_kthread(k); if (!kthread) return; #ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread->blkcg_css); #endif k->worker_private = NULL; kfree(kthread->full_name); kfree(kthread); } /** * kthread_should_stop - should this kthread return now? * * When someone calls kthread_stop() on your kthread, it will be woken * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ bool kthread_should_stop(void) { return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); static bool __kthread_should_park(struct task_struct *k) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); } /** * kthread_should_park - should this kthread park now? * * When someone calls kthread_park() on your kthread, it will be woken * and this will return true. You should then do the necessary * cleanup and call kthread_parkme() * * Similar to kthread_should_stop(), but this keeps the thread alive * and in a park position. kthread_unpark() "restarts" the thread and * calls the thread function again. */ bool kthread_should_park(void) { return __kthread_should_park(current); } EXPORT_SYMBOL_GPL(kthread_should_park); bool kthread_should_stop_or_park(void) { struct kthread *kthread = __to_kthread(current); if (!kthread) return false; return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); } /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * * kthread_should_stop() for freezable kthreads, which will enter * refrigerator if necessary. This function is safe from kthread_stop() / * freezer deadlock and freezable kthreads should use this function instead * of calling try_to_freeze() directly. */ bool kthread_freezable_should_stop(bool *was_frozen) { bool frozen = false; might_sleep(); if (unlikely(freezing(current))) frozen = __refrigerator(true); if (was_frozen) *was_frozen = frozen; return kthread_should_stop(); } EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); /** * kthread_func - return the function specified on kthread creation * @task: kthread task in question * * Returns NULL if the task is not a kthread. 
*/ void *kthread_func(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); if (kthread) return kthread->threadfn; return NULL; } EXPORT_SYMBOL_GPL(kthread_func); /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * * Return the data value specified when kthread @task was created. * The caller is responsible for ensuring the validity of @task when * calling this function. */ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() * @task: possible kthread task in question * * @task could be a kthread task. Return the data value specified when it * was created if accessible. If @task isn't a kthread task or its data is * inaccessible for any reason, %NULL is returned. This function requires * that @task itself is safe to dereference. */ void *kthread_probe_data(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); void *data = NULL; if (kthread) copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); return data; } static void __kthread_parkme(struct kthread *self) { for (;;) { /* * TASK_PARKED is a special state; we must serialize against * possible pending wakeups to avoid store-store collisions on * task->state. * * Such a collision might possibly result in the task state * changing from TASK_PARKED and us failing the * wait_task_inactive() in kthread_park(). */ set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; /* * Thread is going to call schedule(), do not preempt it, * or the caller of kthread_park() may spend more time in * wait_task_inactive(). */ preempt_disable(); complete(&self->parked); schedule_preempt_disabled(); preempt_enable(); } __set_current_state(TASK_RUNNING); } void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } EXPORT_SYMBOL_GPL(kthread_parkme); /** * kthread_exit - Cause the current kthread to return @result to kthread_stop(). * @result: The integer value to return to kthread_stop(). * * While kthread_exit can be called directly, it exists so that * functions which do some additional work in non-modular code such as * module_put_and_kthread_exit can be implemented. * * Does not return. */ void __noreturn kthread_exit(long result) { struct kthread *kthread = to_kthread(current); kthread->result = result; if (!list_empty(&kthread->hotplug_node)) { mutex_lock(&kthreads_hotplug_lock); list_del(&kthread->hotplug_node); mutex_unlock(&kthreads_hotplug_lock); if (kthread->preferred_affinity) { kfree(kthread->preferred_affinity); kthread->preferred_affinity = NULL; } } do_exit(0); } EXPORT_SYMBOL(kthread_exit); /** * kthread_complete_and_exit - Exit the current kthread. * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * * If present, complete @comp and then return code to kthread_stop(). * * A kernel thread whose module may be removed after the completion of * @comp can use this function to exit safely. * * Does not return. 
*/ void __noreturn kthread_complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); kthread_exit(code); } EXPORT_SYMBOL(kthread_complete_and_exit); static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask) { const struct cpumask *pref; if (kthread->preferred_affinity) { pref = kthread->preferred_affinity; } else { if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE)) return; pref = cpumask_of_node(kthread->node); } cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD)); if (cpumask_empty(cpumask)) cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD)); } static void kthread_affine_node(void) { struct kthread *kthread = to_kthread(current); cpumask_var_t affinity; WARN_ON_ONCE(kthread_is_per_cpu(current)); if (kthread->node == NUMA_NO_NODE) { housekeeping_affine(current, HK_TYPE_KTHREAD); } else { if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) { WARN_ON_ONCE(1); return; } mutex_lock(&kthreads_hotplug_lock); WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); /* * The node cpumask is racy when read from kthread() but: * - a racing CPU going down will either fail on the subsequent * call to set_cpus_allowed_ptr() or be migrated to housekeepers * afterwards by the scheduler. * - a racing CPU going up will be handled by kthreads_online_cpu() */ kthread_fetch_affinity(kthread, affinity); set_cpus_allowed_ptr(current, affinity); mutex_unlock(&kthreads_hotplug_lock); free_cpumask_var(affinity); } } static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; struct kthread *self; int ret; self = to_kthread(current); /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { kfree(create->full_name); kfree(create); kthread_exit(-EINTR); } self->full_name = create->full_name; self->threadfn = threadfn; self->data = data; /* * The new thread inherited kthreadd's priority and CPU mask. Reset * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; /* * Thread is going to call schedule(), do not preempt it, * or the creator may spend more time in wait_task_inactive(). */ preempt_disable(); complete(done); schedule_preempt_disabled(); preempt_enable(); self->started = 1; if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity) kthread_affine_node(); ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } kthread_exit(ret); } /* called from kernel_clone() to get node information for about to be created task */ int tsk_fork_get_node(struct task_struct *tsk) { #ifdef CONFIG_NUMA if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) { int pid; #ifdef CONFIG_NUMA current->pref_node_fork = create->node; #endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, create->full_name, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { /* Release the structure when caller killed by a fatal signal. 
*/ struct completion *done = xchg(&create->done, NULL); kfree(create->full_name); if (!done) { kfree(create); return; } create->result = ERR_PTR(pid); complete(done); } } static __printf(4, 0) struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) { DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc(sizeof(*create), GFP_KERNEL); if (!create) return ERR_PTR(-ENOMEM); create->threadfn = threadfn; create->data = data; create->node = node; create->done = &done; create->full_name = kvasprintf(GFP_KERNEL, namefmt, args); if (!create->full_name) { task = ERR_PTR(-ENOMEM); goto free_create; } spin_lock(&kthread_create_lock); list_add_tail(&create->list, &kthread_create_list); spin_unlock(&kthread_create_lock); wake_up_process(kthreadd_task); /* * Wait for completion in killable state, for I might be chosen by * the OOM killer while kthreadd is trying to allocate memory for * new kernel thread. */ if (unlikely(wait_for_completion_killable(&done))) { /* * If I was killed by a fatal signal before kthreadd (or new * kernel thread) calls complete(), leave the cleanup of this * structure to that thread. */ if (xchg(&create->done, NULL)) return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. */ wait_for_completion(&done); } task = create->result; free_create: kfree(create); return task; } /** * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either return directly if it is a * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], ...) { struct task_struct *task; va_list args; va_start(args, namefmt); task = __kthread_create_on_node(threadfn, data, node, namefmt, args); va_end(args); return task; } EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } scoped_guard (raw_spinlock_irqsave, &p->pi_lock) set_cpus_allowed_force(p, mask); /* It's safe because the task is inactive. 
*/ p->flags |= PF_NO_SETAFFINITY; } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) { __kthread_bind_mask(p, cpumask_of(cpu), state); } void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). * @cpu: cpu (might not be online, must be possible) for @k to run on. * * Description: This function is equivalent to set_cpus_allowed(), * except that @cpu doesn't need to be online, and the thread must be * stopped (i.e., just returned from kthread_create()). */ void kthread_bind(struct task_struct *p, unsigned int cpu) { struct kthread *kthread = to_kthread(p); __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } EXPORT_SYMBOL(kthread_bind); /** * kthread_create_on_cpu - Create a cpu bound kthread * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @cpu: The cpu on which the thread should be bound, * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Description: This helper function creates and names a kernel thread */ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, const char *namefmt) { struct task_struct *p; p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu); if (IS_ERR(p)) return p; kthread_bind(p, cpu); /* CPU hotplug need to bind once again when unparking the thread. */ to_kthread(p)->cpu = cpu; return p; } EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { struct kthread *kthread = to_kthread(k); if (!kthread) return; WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY)); if (cpu < 0) { clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags); return; } kthread->cpu = cpu; set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } bool kthread_is_per_cpu(struct task_struct *p) { struct kthread *kthread = __to_kthread(p); if (!kthread) return false; return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return false, wakes it, and * waits for it to return. If the thread is marked percpu then its * bound to the cpu again. */ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)) return; /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. */ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return true, wakes it, and * waits for it to return. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will park without * calling threadfn(). * * Returns 0 if the thread is parked, -ENOSYS if the thread exited. * If called by the kthread itself just the park bit is set. 
*/ int kthread_park(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) return -EBUSY; set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); /* * Wait for __kthread_parkme() to complete(), this means we * _will_ have TASK_PARKED and are about to call schedule(). */ wait_for_completion(&kthread->parked); /* * Now wait for that schedule() to complete and the task to * get scheduled out. */ WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; } EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and * waits for it to exit. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will exit without * calling threadfn(). * * If threadfn() may call kthread_exit() itself, the caller must ensure * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. */ int kthread_stop(struct task_struct *k) { struct kthread *kthread; int ret; trace_sched_kthread_stop(k); get_task_struct(k); kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); kthread_unpark(k); set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL); wake_up_process(k); wait_for_completion(&kthread->exited); ret = kthread->result; put_task_struct(k); trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); /** * kthread_stop_put - stop a thread and put its task struct * @k: thread created by kthread_create(). * * Stops a thread created by kthread_create() and put its task_struct. * Only use when holding an extra task struct reference obtained by * calling get_task_struct(). */ int kthread_stop_put(struct task_struct *k) { int ret; ret = kthread_stop(k); put_task_struct(k); return ret; } EXPORT_SYMBOL(kthread_stop_put); int kthreadd(void *unused) { static const char comm[TASK_COMM_LEN] = "kthreadd"; struct task_struct *tsk = current; /* Setup a clean context for our children to inherit. 
*/ set_task_comm(tsk, comm); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (list_empty(&kthread_create_list)) schedule(); __set_current_state(TASK_RUNNING); spin_lock(&kthread_create_lock); while (!list_empty(&kthread_create_list)) { struct kthread_create_info *create; create = list_entry(kthread_create_list.next, struct kthread_create_info, list); list_del_init(&create->list); spin_unlock(&kthread_create_lock); create_kthread(create); spin_lock(&kthread_create_lock); } spin_unlock(&kthread_create_lock); } return 0; } int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { WARN_ON(1); return -EINVAL; } WARN_ON_ONCE(kthread->preferred_affinity); if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!kthread->preferred_affinity) { ret = -ENOMEM; goto out; } mutex_lock(&kthreads_hotplug_lock); cpumask_copy(kthread->preferred_affinity, mask); WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); kthread_fetch_affinity(kthread, affinity); scoped_guard (raw_spinlock_irqsave, &p->pi_lock) set_cpus_allowed_force(p, affinity); mutex_unlock(&kthreads_hotplug_lock); out: free_cpumask_var(affinity); return ret; } EXPORT_SYMBOL_GPL(kthread_affine_preferred); /* * Re-affine kthreads according to their preferences * and the newly online CPU. The CPU down part is handled * by select_fallback_rq() which default re-affines to * housekeepers from other nodes in case the preferred * affinity doesn't apply anymore. */ static int kthreads_online_cpu(unsigned int cpu) { cpumask_var_t affinity; struct kthread *k; int ret; guard(mutex)(&kthreads_hotplug_lock); if (list_empty(&kthreads_hotplug)) return 0; if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; ret = 0; list_for_each_entry(k, &kthreads_hotplug, hotplug_node) { if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) || kthread_is_per_cpu(k->task))) { ret = -EINVAL; continue; } kthread_fetch_affinity(k, affinity); set_cpus_allowed_ptr(k->task, affinity); } free_cpumask_var(affinity); return ret; } static int kthreads_init(void) { return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online", kthreads_online_cpu, NULL); } early_initcall(kthreads_init); void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); } EXPORT_SYMBOL_GPL(__kthread_init_worker); /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker * * This function implements the main cycle of kthread worker. It processes * work_list until it is stopped with kthread_stop(). It sleeps when the queue * is empty. * * The works are not allowed to keep any locks, disable preemption or interrupts * when they finish. There is defined a safe point for freezing when one work * finishes and before a new one is started. 
* * Also the works must not be handled by more than one worker at the same time, * see also kthread_queue_work(). */ int kthread_worker_fn(void *worker_ptr) { struct kthread_worker *worker = worker_ptr; struct kthread_work *work; /* * FIXME: Update the check and remove the assignment when all kthread * worker users are created using kthread_create_worker*() functions. */ WARN_ON(worker->task && worker->task != current); worker->task = current; if (worker->flags & KTW_FREEZABLE) set_freezable(); repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&worker->lock); worker->task = NULL; raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; raw_spin_unlock_irq(&worker->lock); if (work) { kthread_work_func_t func = work->func; __set_current_state(TASK_RUNNING); trace_sched_kthread_work_execute_start(work); work->func(work); /* * Avoid dereferencing work after this point. The trace * event only cares about the address. */ trace_sched_kthread_work_execute_end(work, func); } else if (!freezing(current)) { schedule(); } else { /* * Handle the case where the current remains * TASK_INTERRUPTIBLE. try_to_freeze() expects * the current to be TASK_RUNNING. */ __set_current_state(TASK_RUNNING); } try_to_freeze(); cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * __kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) return ERR_PTR(-ENOMEM); kthread_init_worker(worker); task = __kthread_create_on_node(kthread_worker_fn, worker, node, namefmt, args); if (IS_ERR(task)) goto fail_task; worker->flags = flags; worker->task = task; return worker; fail_task: kfree(worker); return ERR_CAST(task); } /** * kthread_create_worker_on_node - create a kthread worker * @flags: flags modifying the default behavior of the worker * @node: task structure for the thread is allocated on this node * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker_on_node(flags, node, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_node); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. * * A good practice is to add the cpu number also into the worker name. * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). 
* * CPU hotplug: * The kthread worker API is simple and generic. It just provides a way * to create, use, and destroy workers. * * It is up to the API user how to handle CPU hotplug. They have to decide * how to handle pending work items, prevent queuing new ones, and * restore the functionality when the CPU goes off and on. There are a * few catches: * * - CPU affinity gets lost when it is scheduled on an offline CPU. * * - The worker might not exist when the CPU was off when the user * created the workers. * * Good practice is to implement two CPU hotplug callbacks and to * destroy/create the worker when the CPU goes down/up. * * Return: * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]) { struct kthread_worker *worker; worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu); if (!IS_ERR(worker)) kthread_bind(worker->task, cpu); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_cpu); /* * Returns true when the work could not be queued at the moment. * It happens when it is already pending in a worker list * or when it is being cancelled. */ static inline bool queuing_blocked(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); return !list_empty(&work->node) || work->canceling; } static void kthread_insert_work_sanity_check(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); WARN_ON_ONCE(!list_empty(&work->node)); /* Do not use a work with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker && work->worker != worker); } /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, struct list_head *pos) { kthread_insert_work_sanity_check(worker, work); trace_sched_kthread_work_queue_work(worker, work); list_add_tail(&work->node, pos); work->worker = worker; if (!worker->current_work && likely(worker->task)) wake_up_process(worker->task); } /** * kthread_queue_work - queue a kthread_work * @worker: target kthread_worker * @work: kthread_work to queue * * Queue @work to work processor @task for async execution. @task * must have been created with kthread_create_worker(). Returns %true * if @work was successfully queued, %false if it was already pending. * * Reinitialize the work if it needs to be used by another worker. * For example, when the worker was stopped and started again. */ bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) { bool ret = false; unsigned long flags; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It should have been called from irqsafe timer with irq already off. 
*/ void kthread_delayed_work_timer_fn(struct timer_list *t) { struct kthread_delayed_work *dwork = timer_container_of(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; unsigned long flags; /* * This might happen when a pending work is reinitialized. * It means that it is used a wrong way. */ if (WARN_ON_ONCE(!worker)) return; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); if (!work->canceling) kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ if (!delay) { kthread_insert_work(worker, work, &worker->work_list); return; } /* Be paranoid and try to detect possible races already now. */ kthread_insert_work_sanity_check(worker, work); list_add(&work->node, &worker->delayed_work_list); work->worker = worker; timer->expires = jiffies + delay; add_timer(timer); } /** * kthread_queue_delayed_work - queue the associated kthread work * after a delay. * @worker: target kthread_worker * @dwork: kthread_delayed_work to queue * @delay: number of jiffies to wait before queuing * * If the work has not been pending it starts a timer that will queue * the work after the given @delay. If @delay is zero, it queues the * work immediately. * * Return: %false if the @work has already been pending. It means that * either the timer was running or the work was queued. It returns %true * otherwise. */ bool kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); struct kthread_flush_work { struct kthread_work work; struct completion done; }; static void kthread_flush_work_fn(struct kthread_work *work) { struct kthread_flush_work *fwork = container_of(work, struct kthread_flush_work, work); complete(&fwork->done); } /** * kthread_flush_work - flush a kthread_work * @work: work to flush * * If @work is queued or executing, wait for it to finish execution. */ void kthread_flush_work(struct kthread_work *work) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; struct kthread_worker *worker; bool noop = false; worker = work->worker; if (!worker) return; raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work(). 
*/ WARN_ON_ONCE(work->worker != worker); if (!list_empty(&work->node)) kthread_insert_work(worker, &fwork.work, work->node.next); else if (worker->current_work == work) kthread_insert_work(worker, &fwork.work, worker->work_list.next); else noop = true; raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_work); /* * Make sure that the timer is neither set nor running and could * not manipulate the work list_head any longer. * * The function is called under worker->lock. The lock is temporary * released but the timer can't be set again in the meantime. */ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, unsigned long *flags) { struct kthread_delayed_work *dwork = container_of(work, struct kthread_delayed_work, work); struct kthread_worker *worker = work->worker; /* * timer_delete_sync() must be called to make sure that the timer * callback is not running. The lock must be temporary released * to avoid a deadlock with the callback. In the meantime, * any queuing is blocked by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, *flags); timer_delete_sync(&dwork->timer); raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } /* * This function removes the work from the worker queue. * * It is called under worker->lock. The caller must make sure that * the timer used by delayed work is not running, e.g. by calling * kthread_cancel_delayed_work_timer(). * * The work might still be in use when this function finishes. See the * current_work proceed by the worker. * * Return: %true if @work was pending and successfully canceled, * %false if @work was not pending */ static bool __kthread_cancel_work(struct kthread_work *work) { /* * Try to remove the work from a worker list. It might either * be from worker->work_list or from worker->delayed_work_list. */ if (!list_empty(&work->node)) { list_del_init(&work->node); return true; } return false; } /** * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work * @worker: kthread worker to use * @dwork: kthread delayed work to queue * @delay: number of jiffies to wait before queuing * * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is zero, * @work is guaranteed to be queued immediately. * * Return: %false if @dwork was idle and queued, %true otherwise. * * A special case is when the work is being canceled in parallel. * It might be caused either by the real kthread_cancel_delayed_work_sync() * or yet another kthread_mod_delayed_work() call. We let the other command * win and return %true here. The return value can be used for reference * counting and the number of queued works stays the same. Anyway, the caller * is supposed to synchronize these operations a reasonable way. * * This function is safe to call from any context including IRQ handler. * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() * for details. */ bool kthread_mod_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; int ret; raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. 
*/ if (!work->worker) { ret = false; goto fast_queue; } /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); /* * Temporary cancel the work but do not fight with another command * that is canceling the work as well. * * It is a bit tricky because of possible races with another * mod_delayed_work() and cancel_delayed_work() callers. * * The timer must be canceled first because worker->lock is released * when doing so. But the work can be removed from the queue (list) * only when it can be queued again so that the return value can * be used for reference counting. */ kthread_cancel_delayed_work_timer(work, &flags); if (work->canceling) { /* The number of works in the queue does not change. */ ret = true; goto out; } ret = __kthread_cancel_work(work); fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) { struct kthread_worker *worker = work->worker; unsigned long flags; int ret = false; if (!worker) goto out; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (is_dwork) kthread_cancel_delayed_work_timer(work, &flags); ret = __kthread_cancel_work(work); if (worker->current_work != work) goto out_fast; /* * The work is in progress and we need to wait with the lock released. * In the meantime, block any queuing by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } /** * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish * @work: the kthread work to cancel * * Cancel @work and wait for its execution to finish. This function * can be used even if the work re-queues itself. On return from this * function, @work is guaranteed to be not pending or executing on any CPU. * * kthread_cancel_work_sync(&delayed_work->work) must not be used for * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. * * The caller must ensure that the worker on which @work was last * queued can't be destroyed before this function returns. * * Return: %true if @work was pending, %false otherwise. */ bool kthread_cancel_work_sync(struct kthread_work *work) { return __kthread_cancel_work_sync(work, false); } EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); /** * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and * wait for it to finish. * @dwork: the kthread delayed work to cancel * * This is kthread_cancel_work_sync() for delayed works. * * Return: %true if @dwork was pending, %false otherwise. */ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) { return __kthread_cancel_work_sync(&dwork->work, true); } EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); /** * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush * * Wait until all currently executing or pending works on @worker are * finished. 
*/ void kthread_flush_worker(struct kthread_worker *worker) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; kthread_queue_work(worker, &fwork.work); wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_worker); /** * kthread_destroy_worker - destroy a kthread worker * @worker: worker to be destroyed * * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. * * Note that this function is not responsible for handling delayed work, so * caller should be responsible for queuing or canceling all delayed work items * before invoke this function. */ void kthread_destroy_worker(struct kthread_worker *worker) { struct task_struct *task; task = worker->task; if (WARN_ON(!task)) return; kthread_flush_worker(worker); kthread_stop(task); WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); /** * kthread_use_mm - make the calling kthread operate on an address space * @mm: address space to operate on */ void kthread_use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); /* * It is possible for mm to be the same as tsk->active_mm, but * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), * because these references are not equivalent. */ mmgrab(mm); task_lock(tsk); /* Hold off tlb flush IPIs while switching mm's */ local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); #endif /* * When a kthread starts operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after storing to tsk->mm, before accessing * user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by * mmdrop_lazy_tlb(). */ mmdrop_lazy_tlb(active_mm); } EXPORT_SYMBOL_GPL(kthread_use_mm); /** * kthread_unuse_mm - reverse the effect of kthread_use_mm() * @mm: address space to operate on */ void kthread_unuse_mm(struct mm_struct *mm) { struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); task_lock(tsk); /* * When a kthread stops operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after accessing user-space memory, before * clearing tsk->mm. */ smp_mb__after_spinlock(); local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info * * Current thread must be a kthread. The thread is running jobs on behalf of * other threads. In some cases, we expect the jobs attach cgroup info of * original threads instead of that of current thread. 
This function stores
 * original thread's cgroup info in current kthread context for later
 * retrieval.
 */
void kthread_associate_blkcg(struct cgroup_subsys_state *css)
{
	struct kthread *kthread;

	if (!(current->flags & PF_KTHREAD))
		return;
	kthread = to_kthread(current);
	if (!kthread)
		return;

	if (kthread->blkcg_css) {
		css_put(kthread->blkcg_css);
		kthread->blkcg_css = NULL;
	}
	if (css) {
		css_get(css);
		kthread->blkcg_css = css;
	}
}
EXPORT_SYMBOL(kthread_associate_blkcg);

/**
 * kthread_blkcg - get associated blkcg css of current kthread
 *
 * Current thread must be a kthread.
 */
struct cgroup_subsys_state *kthread_blkcg(void)
{
	struct kthread *kthread;

	if (current->flags & PF_KTHREAD) {
		kthread = to_kthread(current);
		if (kthread)
			return kthread->blkcg_css;
	}
	return NULL;
}
#endif
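To illustrate the kthread_worker API defined above, here is a minimal, self-contained sketch of a module that creates a dedicated worker, queues one work item and tears everything down on exit. The names my_worker, my_work, my_work_fn and the log message are hypothetical placeholders, not part of kthread.c.

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/printk.h>

/* Hypothetical example state, not part of kthread.c. */
static struct kthread_worker *my_worker;
static struct kthread_work my_work;

static void my_work_fn(struct kthread_work *work)
{
	/* Runs in the worker thread's context, see kthread_worker_fn(). */
	pr_info("example kthread work executed\n");
}

static int __init my_worker_init(void)
{
	/* Convenience wrapper around the *_on_node() creation path above. */
	my_worker = kthread_create_worker(0, "example_kworker");
	if (IS_ERR(my_worker))
		return PTR_ERR(my_worker);

	kthread_init_work(&my_work, my_work_fn);
	kthread_queue_work(my_worker, &my_work);
	return 0;
}

static void __exit my_worker_exit(void)
{
	/* Flushes pending work and stops the worker task. */
	kthread_destroy_worker(my_worker);
}

module_init(my_worker_init);
module_exit(my_worker_exit);
MODULE_LICENSE("GPL");

kthread_destroy_worker() performs the flush-then-stop sequence implemented above, so no work item may be queued after it has been called.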
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2020, Red Hat, Inc.
*/ #define pr_fmt(fmt) "MPTCP: " fmt #include "protocol.h" #include "mptcp_pm_gen.h" #define MPTCP_PM_CMD_GRP_OFFSET 0 #define MPTCP_PM_EV_GRP_OFFSET 1 static const struct genl_multicast_group mptcp_pm_mcgrps[] = { [MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, }, [MPTCP_PM_EV_GRP_OFFSET] = { .name = MPTCP_PM_EV_GRP_NAME, .flags = GENL_MCAST_CAP_NET_ADMIN, }, }; static int mptcp_pm_family_to_addr(int family) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (family == AF_INET6) return MPTCP_PM_ADDR_ATTR_ADDR6; #endif return MPTCP_PM_ADDR_ATTR_ADDR4; } static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[], const struct nlattr *attr, struct genl_info *info, struct mptcp_addr_info *addr, bool require_family) { int err, addr_addr; if (!attr) { GENL_SET_ERR_MSG(info, "missing address info"); return -EINVAL; } /* no validation needed - was already done via nested policy */ err = nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, mptcp_pm_address_nl_policy, info->extack); if (err) return err; if (tb[MPTCP_PM_ADDR_ATTR_ID]) addr->id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) { if (!require_family) return 0; NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing family"); return -EINVAL; } addr->family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]); if (addr->family != AF_INET #if IS_ENABLED(CONFIG_MPTCP_IPV6) && addr->family != AF_INET6 #endif ) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "unknown address family"); return -EINVAL; } addr_addr = mptcp_pm_family_to_addr(addr->family); if (!tb[addr_addr]) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing address data"); return -EINVAL; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (addr->family == AF_INET6) addr->addr6 = nla_get_in6_addr(tb[addr_addr]); else #endif addr->addr.s_addr = nla_get_in_addr(tb[addr_addr]); if (tb[MPTCP_PM_ADDR_ATTR_PORT]) addr->port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); return 0; } int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, struct mptcp_addr_info *addr) { struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; memset(addr, 0, sizeof(*addr)); return mptcp_pm_parse_pm_addr_attr(tb, attr, info, addr, true); } int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, bool require_family, struct mptcp_pm_addr_entry *entry) { struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; int err; memset(entry, 0, sizeof(*entry)); err = mptcp_pm_parse_pm_addr_attr(tb, attr, info, &entry->addr, require_family); if (err) return err; if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) { s32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); entry->ifindex = val; } if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); if (tb[MPTCP_PM_ADDR_ATTR_PORT]) entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); return 0; } static int mptcp_nl_fill_addr(struct sk_buff *skb, struct mptcp_pm_addr_entry *entry) { struct mptcp_addr_info *addr = &entry->addr; struct nlattr *attr; attr = nla_nest_start(skb, MPTCP_PM_ATTR_ADDR); if (!attr) return -EMSGSIZE; if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_FAMILY, addr->family)) goto nla_put_failure; if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_PORT, ntohs(addr->port))) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id)) goto nla_put_failure; if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags)) goto nla_put_failure; if (entry->ifindex && nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex)) goto nla_put_failure; if (addr->family == AF_INET && nla_put_in_addr(skb, 
MPTCP_PM_ADDR_ATTR_ADDR4, addr->addr.s_addr)) goto nla_put_failure; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6 && nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6)) goto nla_put_failure; #endif nla_nest_end(skb, attr); return 0; nla_put_failure: nla_nest_cancel(skb, attr); return -EMSGSIZE; } static int mptcp_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, struct genl_info *info) { if (info->attrs[MPTCP_PM_ATTR_TOKEN]) return mptcp_userspace_pm_get_addr(id, addr, info); return mptcp_pm_nl_get_addr(id, addr, info); } int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info) { struct mptcp_pm_addr_entry addr; struct nlattr *attr; struct sk_buff *msg; void *reply; int ret; if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) return -EINVAL; attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, info->genlhdr->cmd); if (!reply) { GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); ret = -EMSGSIZE; goto fail; } ret = mptcp_pm_get_addr(addr.addr.id, &addr, info); if (ret) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found"); goto fail; } ret = mptcp_nl_fill_addr(msg, &addr); if (ret) goto fail; genlmsg_end(msg, reply); ret = genlmsg_reply(msg, info); return ret; fail: nlmsg_free(msg); return ret; } int mptcp_pm_genl_fill_addr(struct sk_buff *msg, struct netlink_callback *cb, struct mptcp_pm_addr_entry *entry) { void *hdr; hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &mptcp_genl_family, NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); if (!hdr) return -EINVAL; if (mptcp_nl_fill_addr(msg, entry) < 0) { genlmsg_cancel(msg, hdr); return -EINVAL; } genlmsg_end(msg, hdr); return 0; } static int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) { const struct genl_info *info = genl_info_dump(cb); if (info->attrs[MPTCP_PM_ATTR_TOKEN]) return mptcp_userspace_pm_dump_addr(msg, cb); return mptcp_pm_nl_dump_addr(msg, cb); } int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { return mptcp_pm_dump_addr(msg, cb); } static int mptcp_pm_set_flags(struct genl_info *info) { struct mptcp_pm_addr_entry loc = { .addr = { .family = AF_UNSPEC }, }; struct nlattr *attr_loc; int ret = -EINVAL; if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR)) return ret; attr_loc = info->attrs[MPTCP_PM_ATTR_ADDR]; ret = mptcp_pm_parse_entry(attr_loc, info, false, &loc); if (ret < 0) return ret; if (info->attrs[MPTCP_PM_ATTR_TOKEN]) return mptcp_userspace_pm_set_flags(&loc, info); return mptcp_pm_nl_set_flags(&loc, info); } int mptcp_pm_nl_set_flags_doit(struct sk_buff *skb, struct genl_info *info) { return mptcp_pm_set_flags(info); } static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp) { genlmsg_multicast_netns(&mptcp_genl_family, net, nlskb, 0, MPTCP_PM_EV_GRP_OFFSET, gfp); } bool mptcp_userspace_pm_active(const struct mptcp_sock *msk) { return genl_has_listeners(&mptcp_genl_family, sock_net((const struct sock *)msk), MPTCP_PM_EV_GRP_OFFSET); } static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk) { const struct inet_sock *issk = inet_sk(ssk); const struct mptcp_subflow_context *sf; if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family)) return -EMSGSIZE; switch (ssk->sk_family) { case AF_INET: if (nla_put_in_addr(skb, 
MPTCP_ATTR_SADDR4, issk->inet_saddr)) return -EMSGSIZE; if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, issk->inet_daddr)) return -EMSGSIZE; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &issk->pinet6->saddr)) return -EMSGSIZE; if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &ssk->sk_v6_daddr)) return -EMSGSIZE; break; } #endif default: WARN_ON_ONCE(1); return -EMSGSIZE; } if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport)) return -EMSGSIZE; if (nla_put_be16(skb, MPTCP_ATTR_DPORT, issk->inet_dport)) return -EMSGSIZE; sf = mptcp_subflow_ctx(ssk); if (WARN_ON_ONCE(!sf)) return -EINVAL; if (nla_put_u8(skb, MPTCP_ATTR_LOC_ID, subflow_get_local_id(sf))) return -EMSGSIZE; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, sf->remote_id)) return -EMSGSIZE; return 0; } static int mptcp_event_put_token_and_ssk(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { const struct sock *sk = (const struct sock *)msk; const struct mptcp_subflow_context *sf; u8 sk_err; if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) return -EMSGSIZE; if (mptcp_event_add_subflow(skb, ssk)) return -EMSGSIZE; sf = mptcp_subflow_ctx(ssk); if (WARN_ON_ONCE(!sf)) return -EINVAL; if (nla_put_u8(skb, MPTCP_ATTR_BACKUP, sf->backup)) return -EMSGSIZE; if (ssk->sk_bound_dev_if && nla_put_s32(skb, MPTCP_ATTR_IF_IDX, ssk->sk_bound_dev_if)) return -EMSGSIZE; sk_err = READ_ONCE(ssk->sk_err); if (sk_err && sk->sk_state == TCP_ESTABLISHED && nla_put_u8(skb, MPTCP_ATTR_ERROR, sk_err)) return -EMSGSIZE; return 0; } static int mptcp_event_sub_established(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { return mptcp_event_put_token_and_ssk(skb, msk, ssk); } static int mptcp_event_sub_closed(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { const struct mptcp_subflow_context *sf; if (mptcp_event_put_token_and_ssk(skb, msk, ssk)) return -EMSGSIZE; sf = mptcp_subflow_ctx(ssk); if (!sf->reset_seen) return 0; if (nla_put_u32(skb, MPTCP_ATTR_RESET_REASON, sf->reset_reason)) return -EMSGSIZE; if (nla_put_u32(skb, MPTCP_ATTR_RESET_FLAGS, sf->reset_transient)) return -EMSGSIZE; return 0; } static int mptcp_event_created(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)); u16 flags = 0; if (err) return err; if (READ_ONCE(msk->pm.server_side)) { flags |= MPTCP_PM_EV_FLAG_SERVER_SIDE; /* Deprecated, and only set when it is the server side */ if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1)) return -EMSGSIZE; } if (READ_ONCE(msk->pm.remote_deny_join_id0)) flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0; if (flags && nla_put_u16(skb, MPTCP_ATTR_FLAGS, flags)) return -EMSGSIZE; return mptcp_event_add_subflow(skb, ssk); } void mptcp_event_addr_removed(const struct mptcp_sock *msk, uint8_t id) { struct net *net = sock_net((const struct sock *)msk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, MPTCP_EVENT_REMOVED); if (!nlh) goto nla_put_failure; if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, id)) goto nla_put_failure; genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, GFP_ATOMIC); return; nla_put_failure: nlmsg_free(skb); } void mptcp_event_addr_announced(const struct 
sock *ssk, const struct mptcp_addr_info *info) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct net *net = sock_net(ssk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, MPTCP_EVENT_ANNOUNCED); if (!nlh) goto nla_put_failure; if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, info->id)) goto nla_put_failure; if (nla_put_be16(skb, MPTCP_ATTR_DPORT, info->port == 0 ? inet_sk(ssk)->inet_dport : info->port)) goto nla_put_failure; switch (info->family) { case AF_INET: if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, info->addr.s_addr)) goto nla_put_failure; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &info->addr6)) goto nla_put_failure; break; #endif default: WARN_ON_ONCE(1); goto nla_put_failure; } genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, GFP_ATOMIC); return; nla_put_failure: nlmsg_free(skb); } void mptcp_event_pm_listener(const struct sock *ssk, enum mptcp_event_type event) { const struct inet_sock *issk = inet_sk(ssk); struct net *net = sock_net(ssk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, event); if (!nlh) goto nla_put_failure; if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family)) goto nla_put_failure; if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport)) goto nla_put_failure; switch (ssk->sk_family) { case AF_INET: if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr)) goto nla_put_failure; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &issk->pinet6->saddr)) goto nla_put_failure; break; } #endif default: WARN_ON_ONCE(1); goto nla_put_failure; } genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, GFP_KERNEL); return; nla_put_failure: nlmsg_free(skb); } void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp) { struct net *net = sock_net((const struct sock *)msk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, type); if (!nlh) goto nla_put_failure; switch (type) { case MPTCP_EVENT_UNSPEC: WARN_ON_ONCE(1); break; case MPTCP_EVENT_CREATED: case MPTCP_EVENT_ESTABLISHED: if (mptcp_event_created(skb, msk, ssk) < 0) goto nla_put_failure; break; case MPTCP_EVENT_CLOSED: if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)) < 0) goto nla_put_failure; break; case MPTCP_EVENT_ANNOUNCED: case MPTCP_EVENT_REMOVED: /* call mptcp_event_addr_announced()/removed instead */ WARN_ON_ONCE(1); break; case MPTCP_EVENT_SUB_ESTABLISHED: case MPTCP_EVENT_SUB_PRIORITY: if (mptcp_event_sub_established(skb, msk, ssk) < 0) goto nla_put_failure; break; case MPTCP_EVENT_SUB_CLOSED: if (mptcp_event_sub_closed(skb, msk, ssk) < 0) goto nla_put_failure; break; case MPTCP_EVENT_LISTENER_CREATED: case MPTCP_EVENT_LISTENER_CLOSED: break; } genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, gfp); return; 
nla_put_failure:
	nlmsg_free(skb);
}

struct genl_family mptcp_genl_family __ro_after_init = {
	.name		= MPTCP_PM_NAME,
	.version	= MPTCP_PM_VER,
	.netnsok	= true,
	.module		= THIS_MODULE,
	.ops		= mptcp_pm_nl_ops,
	.n_ops		= ARRAY_SIZE(mptcp_pm_nl_ops),
	.resv_start_op	= MPTCP_PM_CMD_SUBFLOW_DESTROY + 1,
	.mcgrps		= mptcp_pm_mcgrps,
	.n_mcgrps	= ARRAY_SIZE(mptcp_pm_mcgrps),
};

void __init mptcp_pm_nl_init(void)
{
	if (genl_register_family(&mptcp_genl_family))
		panic("Failed to register MPTCP PM netlink family\n");
}
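All of the mptcp_event*() helpers above follow the same allocate, genlmsg_put(), nla_put*(), genlmsg_end(), multicast pattern. Below is a minimal sketch of that pattern for a hypothetical generic netlink family; the family pointer, command number, attribute type and multicast group index are placeholders rather than MPTCP definitions.

#include <net/genetlink.h>

/*
 * Minimal sketch of the event-multicast pattern used above.  The command
 * number, the attribute type and the group index 0 are hypothetical.
 */
static void my_event_notify(struct genl_family *family, struct net *net,
			    u32 token, gfp_t gfp)
{
	struct sk_buff *skb;
	void *hdr;

	/* Skip the allocation entirely when nobody is listening. */
	if (!genl_has_listeners(family, net, 0))
		return;

	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
	if (!skb)
		return;

	hdr = genlmsg_put(skb, 0, 0, family, 0, 1 /* hypothetical cmd */);
	if (!hdr)
		goto fail;

	if (nla_put_u32(skb, 1 /* hypothetical attr type */, token))
		goto fail;

	genlmsg_end(skb, hdr);
	genlmsg_multicast_netns(family, net, skb, 0, 0, gfp);
	return;

fail:
	nlmsg_free(skb);
}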
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_NOTIFY_H
#define _LINUX_FS_NOTIFY_H

/*
 * include/linux/fsnotify.h - generic hooks for filesystem notification, to
 * reduce in-source duplication from both dnotify and inotify.
 *
 * We don't compile any of this away in some complicated menagerie of ifdefs.
 * Instead, we rely on the code inside to optimize away as needed.
 *
 * (C) Copyright 2005 Robert Love
 */

#include <linux/fsnotify_backend.h>
#include <linux/audit.h>
#include <linux/slab.h>
#include <linux/bug.h>

/* Are there any inode/mount/sb objects watched with priority prio or above? */
static inline bool fsnotify_sb_has_priority_watchers(struct super_block *sb,
						     int prio)
{
	struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);

	/* Were any marks ever added to any object on this sb? */
	if (!sbinfo)
		return false;

	return atomic_long_read(&sbinfo->watched_objects[prio]);
}

/* Are there any inode/mount/sb objects that are being watched at all? */
static inline bool fsnotify_sb_has_watchers(struct super_block *sb)
{
	return fsnotify_sb_has_priority_watchers(sb, 0);
}

/*
 * Notify this @dir inode about a change in a child directory entry.
 * The directory entry may have turned positive or negative or its inode may
 * have changed (i.e. renamed over).
* * Unlike fsnotify_parent(), the event will be reported regardless of the * FS_EVENT_ON_CHILD mask on the parent inode and will not be reported if only * the child is interested and not the parent. */ static inline int fsnotify_name(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie) { if (!fsnotify_sb_has_watchers(dir->i_sb)) return 0; return fsnotify(mask, data, data_type, dir, name, NULL, cookie); } static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry, __u32 mask) { fsnotify_name(mask, dentry, FSNOTIFY_EVENT_DENTRY, dir, &dentry->d_name, 0); } static inline void fsnotify_inode(struct inode *inode, __u32 mask) { if (!fsnotify_sb_has_watchers(inode->i_sb)) return; if (S_ISDIR(inode->i_mode)) mask |= FS_ISDIR; fsnotify(mask, inode, FSNOTIFY_EVENT_INODE, NULL, NULL, inode, 0); } /* Notify this dentry's parent about a child's events. */ static inline int fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { struct inode *inode = d_inode(dentry); if (!fsnotify_sb_has_watchers(inode->i_sb)) return 0; if (S_ISDIR(inode->i_mode)) { mask |= FS_ISDIR; /* sb/mount marks are not interested in name of directory */ if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) goto notify_child; } /* disconnected dentry cannot notify parent */ if (IS_ROOT(dentry)) goto notify_child; return __fsnotify_parent(dentry, mask, data, data_type); notify_child: return fsnotify(mask, data, data_type, NULL, NULL, inode, 0); } /* * Simple wrappers to consolidate calls to fsnotify_parent() when an event * is on a file/dentry. */ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask) { fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY); } static inline int fsnotify_path(const struct path *path, __u32 mask) { return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } static inline int fsnotify_file(struct file *file, __u32 mask) { /* * FMODE_NONOTIFY are fds generated by fanotify itself which should not * generate new events. We also don't want to generate events for * FMODE_PATH fds (involves open & close events) as they are just * handle creation / destruction events and not "real" file events. */ if (FMODE_FSNOTIFY_NONE(file->f_mode)) return 0; return fsnotify_path(&file->f_path, mask); } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS int fsnotify_open_perm_and_set_mode(struct file *file); /* * fsnotify_file_area_perm - permission hook before access to file range */ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, const loff_t *ppos, size_t count) { /* * filesystem may be modified in the context of permission events * (e.g. by HSM filling a file on access), so sb freeze protection * must not be held. */ lockdep_assert_once(file_write_not_started(file)); if (!(perm_mask & (MAY_READ | MAY_WRITE | MAY_ACCESS))) return 0; /* * read()/write() and other types of access generate pre-content events. */ if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { int ret = fsnotify_pre_content(&file->f_path, ppos, count); if (ret) return ret; } if (!(perm_mask & MAY_READ) || likely(!FMODE_FSNOTIFY_ACCESS_PERM(file->f_mode))) return 0; /* * read() also generates the legacy FS_ACCESS_PERM event, so content * scanners can inspect the content filled by pre-content event. 
*/ return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } /* * fsnotify_mmap_perm - permission hook before mmap of file range */ static inline int fsnotify_mmap_perm(struct file *file, int prot, const loff_t off, size_t len) { /* * mmap() generates only pre-content events. */ if (!file || likely(!FMODE_FSNOTIFY_HSM(file->f_mode))) return 0; return fsnotify_pre_content(&file->f_path, &off, len); } /* * fsnotify_truncate_perm - permission hook before file truncate */ static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) { struct inode *inode = d_inode(path->dentry); if (!(inode->i_sb->s_iflags & SB_I_ALLOW_HSM) || !fsnotify_sb_has_priority_watchers(inode->i_sb, FSNOTIFY_PRIO_PRE_CONTENT)) return 0; return fsnotify_pre_content(path, &length, 0); } /* * fsnotify_file_perm - permission hook before file access (unknown range) */ static inline int fsnotify_file_perm(struct file *file, int perm_mask) { return fsnotify_file_area_perm(file, perm_mask, NULL, 0); } #else static inline int fsnotify_open_perm_and_set_mode(struct file *file) { return 0; } static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, const loff_t *ppos, size_t count) { return 0; } static inline int fsnotify_mmap_perm(struct file *file, int prot, const loff_t off, size_t len) { return 0; } static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) { return 0; } static inline int fsnotify_file_perm(struct file *file, int perm_mask) { return 0; } #endif /* * fsnotify_link_count - inode's link count changed */ static inline void fsnotify_link_count(struct inode *inode) { fsnotify_inode(inode, FS_ATTRIB); } /* * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir */ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, const struct qstr *old_name, int isdir, struct inode *target, struct dentry *moved) { struct inode *source = moved->d_inode; u32 fs_cookie = fsnotify_get_cookie(); __u32 old_dir_mask = FS_MOVED_FROM; __u32 new_dir_mask = FS_MOVED_TO; __u32 rename_mask = FS_RENAME; const struct qstr *new_name = &moved->d_name; if (isdir) { old_dir_mask |= FS_ISDIR; new_dir_mask |= FS_ISDIR; rename_mask |= FS_ISDIR; } /* Event with information about both old and new parent+name */ fsnotify_name(rename_mask, moved, FSNOTIFY_EVENT_DENTRY, old_dir, old_name, 0); fsnotify_name(old_dir_mask, source, FSNOTIFY_EVENT_INODE, old_dir, old_name, fs_cookie); fsnotify_name(new_dir_mask, source, FSNOTIFY_EVENT_INODE, new_dir, new_name, fs_cookie); if (target) fsnotify_link_count(target); fsnotify_inode(source, FS_MOVE_SELF); audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE); } /* * fsnotify_inode_delete - and inode is being evicted from cache, clean up is needed */ static inline void fsnotify_inode_delete(struct inode *inode) { __fsnotify_inode_delete(inode); } /* * fsnotify_vfsmount_delete - a vfsmount is being destroyed, clean up is needed */ static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt) { __fsnotify_vfsmount_delete(mnt); } static inline void fsnotify_mntns_delete(struct mnt_namespace *mntns) { __fsnotify_mntns_delete(mntns); } /* * fsnotify_inoderemove - an inode is going away */ static inline void fsnotify_inoderemove(struct inode *inode) { fsnotify_inode(inode, FS_DELETE_SELF); __fsnotify_inode_delete(inode); } /* * fsnotify_create - 'name' was linked in * * Caller must make sure that dentry->d_name is stable. * Note: some filesystems (e.g. 
kernfs) leave @dentry negative and instantiate * ->d_inode later */ static inline void fsnotify_create(struct inode *dir, struct dentry *dentry) { audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE); fsnotify_dirent(dir, dentry, FS_CREATE); } /* * fsnotify_link - new hardlink in 'inode' directory * * Caller must make sure that new_dentry->d_name is stable. * Note: We have to pass also the linked inode ptr as some filesystems leave * new_dentry->d_inode NULL and instantiate inode pointer later */ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct dentry *new_dentry) { fsnotify_link_count(inode); audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE); fsnotify_name(FS_CREATE, inode, FSNOTIFY_EVENT_INODE, dir, &new_dentry->d_name, 0); } /* * fsnotify_delete - @dentry was unlinked and unhashed * * Caller must make sure that dentry->d_name is stable. * * Note: unlike fsnotify_unlink(), we have to pass also the unlinked inode * as this may be called after d_delete() and old_dentry may be negative. */ static inline void fsnotify_delete(struct inode *dir, struct inode *inode, struct dentry *dentry) { __u32 mask = FS_DELETE; if (S_ISDIR(inode->i_mode)) mask |= FS_ISDIR; fsnotify_name(mask, inode, FSNOTIFY_EVENT_INODE, dir, &dentry->d_name, 0); } /** * d_delete_notify - delete a dentry and call fsnotify_delete() * @dentry: The dentry to delete * * This helper is used to guaranty that the unlinked inode cannot be found * by lookup of this name after fsnotify_delete() event has been delivered. */ static inline void d_delete_notify(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); ihold(inode); d_delete(dentry); fsnotify_delete(dir, inode, dentry); iput(inode); } /* * fsnotify_unlink - 'name' was unlinked * * Caller must make sure that dentry->d_name is stable. */ static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry) { if (WARN_ON_ONCE(d_is_negative(dentry))) return; fsnotify_delete(dir, d_inode(dentry), dentry); } /* * fsnotify_mkdir - directory 'name' was created * * Caller must make sure that dentry->d_name is stable. * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate * ->d_inode later */ static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry) { audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE); fsnotify_dirent(dir, dentry, FS_CREATE | FS_ISDIR); } /* * fsnotify_rmdir - directory 'name' was removed * * Caller must make sure that dentry->d_name is stable. */ static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry) { if (WARN_ON_ONCE(d_is_negative(dentry))) return; fsnotify_delete(dir, d_inode(dentry), dentry); } /* * fsnotify_access - file was read */ static inline void fsnotify_access(struct file *file) { fsnotify_file(file, FS_ACCESS); } /* * fsnotify_modify - file was modified */ static inline void fsnotify_modify(struct file *file) { fsnotify_file(file, FS_MODIFY); } /* * fsnotify_open - file was opened */ static inline void fsnotify_open(struct file *file) { __u32 mask = FS_OPEN; if (file->f_flags & __FMODE_EXEC) mask |= FS_OPEN_EXEC; fsnotify_file(file, mask); } /* * fsnotify_close - file was closed */ static inline void fsnotify_close(struct file *file) { __u32 mask = (file->f_mode & FMODE_WRITE) ? 
FS_CLOSE_WRITE : FS_CLOSE_NOWRITE; fsnotify_file(file, mask); } /* * fsnotify_xattr - extended attributes were changed */ static inline void fsnotify_xattr(struct dentry *dentry) { fsnotify_dentry(dentry, FS_ATTRIB); } /* * fsnotify_change - notify_change event. file was modified and/or metadata * was changed. */ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) { __u32 mask = 0; if (ia_valid & ATTR_UID) mask |= FS_ATTRIB; if (ia_valid & ATTR_GID) mask |= FS_ATTRIB; if (ia_valid & ATTR_SIZE) mask |= FS_MODIFY; /* both times implies a utime(s) call */ if ((ia_valid & (ATTR_ATIME | ATTR_MTIME)) == (ATTR_ATIME | ATTR_MTIME)) mask |= FS_ATTRIB; else if (ia_valid & ATTR_ATIME) mask |= FS_ACCESS; else if (ia_valid & ATTR_MTIME) mask |= FS_MODIFY; if (ia_valid & ATTR_MODE) mask |= FS_ATTRIB; if (mask) fsnotify_dentry(dentry, mask); } static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode, int error) { struct fs_error_report report = { .error = error, .inode = inode, .sb = sb, }; return fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, NULL, NULL, NULL, 0); } static inline void fsnotify_mnt_attach(struct mnt_namespace *ns, struct vfsmount *mnt) { fsnotify_mnt(FS_MNT_ATTACH, ns, mnt); } static inline void fsnotify_mnt_detach(struct mnt_namespace *ns, struct vfsmount *mnt) { fsnotify_mnt(FS_MNT_DETACH, ns, mnt); } static inline void fsnotify_mnt_move(struct mnt_namespace *ns, struct vfsmount *mnt) { fsnotify_mnt(FS_MNT_MOVE, ns, mnt); } #endif /* _LINUX_FS_NOTIFY_H */ |
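/*
 * Illustrative user-space sketch (not part of the header above): the
 * FS_CREATE, FS_DELETE, FS_MOVED_FROM/TO and FS_CLOSE_WRITE events raised
 * by the fsnotify_* helpers are what inotify ultimately delivers to
 * applications. The watched path "/tmp" is only an example.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	char buf[4096] __attribute__((aligned(__alignof__(struct inotify_event))));
	int fd = inotify_init1(IN_CLOEXEC);

	if (fd < 0) {
		perror("inotify_init1");
		return 1;
	}
	/* Event types generated by fsnotify_create(), fsnotify_delete(),
	 * fsnotify_move() and fsnotify_close() above. */
	if (inotify_add_watch(fd, "/tmp",
			      IN_CREATE | IN_DELETE | IN_MOVED_FROM |
			      IN_MOVED_TO | IN_CLOSE_WRITE) < 0) {
		perror("inotify_add_watch");
		return 1;
	}
	for (;;) {
		ssize_t len = read(fd, buf, sizeof(buf));
		ssize_t off = 0;

		if (len <= 0)
			break;
		while (off < len) {
			struct inotify_event *ev = (void *)(buf + off);

			printf("mask=0x%x name=%s\n", (unsigned)ev->mask,
			       ev->len ? ev->name : "");
			off += (ssize_t)sizeof(*ev) + ev->len;
		}
	}
	close(fd);
	return 0;
}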
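/*
 * A second illustrative sketch, for the permission-event path above
 * (fsnotify_open_perm_and_set_mode() and the FS_ACCESS_PERM / pre-content
 * hooks): a fanotify listener opened with FAN_CLASS_CONTENT must answer
 * every FAN_OPEN_PERM event it reads. Needs CAP_SYS_ADMIN; the mount point
 * "/mnt" is purely an example.
 */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/fanotify.h>

int main(void)
{
	struct fanotify_event_metadata buf[64];
	int fd = fanotify_init(FAN_CLASS_CONTENT | FAN_CLOEXEC, O_RDONLY);

	if (fd < 0) {
		perror("fanotify_init");
		return 1;
	}
	if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_MOUNT, FAN_OPEN_PERM,
			  AT_FDCWD, "/mnt") < 0) {
		perror("fanotify_mark");
		return 1;
	}
	for (;;) {
		ssize_t len = read(fd, buf, sizeof(buf));
		struct fanotify_event_metadata *md = buf;

		if (len <= 0)
			break;
		while (FAN_EVENT_OK(md, len)) {
			if (md->mask & FAN_OPEN_PERM) {
				struct fanotify_response resp = {
					.fd = md->fd,
					.response = FAN_ALLOW,
				};

				/* Permission events block the opener until
				 * a response is written back. */
				if (write(fd, &resp, sizeof(resp)) < 0)
					perror("write response");
			}
			if (md->fd >= 0)
				close(md->fd);
			md = FAN_EVENT_NEXT(md, len);
		}
	}
	close(fd);
	return 0;
}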
// SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) International Business Machines Corp., 2000,2005 * * Modified by Steve French (sfrench@us.ibm.com) */ #include <linux/fs.h> #include <linux/string.h> #include <linux/ctype.h> #include <linux/kstrtox.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/uaccess.h> #include <uapi/linux/ethtool.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" #include "cifsfs.h" #include "fs_context.h" #ifdef CONFIG_CIFS_DFS_UPCALL #include "dfs_cache.h" #endif #ifdef CONFIG_CIFS_SMB_DIRECT #include "smbdirect.h" #include "../common/smbdirect/smbdirect_pdu.h" #endif #include "cifs_swn.h" #include "cached_dir.h" void cifs_dump_mem(char *label, void *data, int length) { pr_debug("%s: dump of %d bytes of data at 0x%p\n", label, length, data); print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 16, 4, data, length, true); } void cifs_dump_detail(void *buf, struct TCP_Server_Info *server) { #ifdef CONFIG_CIFS_DEBUG2 struct smb_hdr *smb = buf; cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d Wct: %d\n", smb->Command, smb->Status.CifsError, smb->Flags, smb->Flags2, smb->Mid, smb->Pid, smb->WordCount); if (!server->ops->check_message(buf, server->total_read, server)) { cifs_dbg(VFS, "smb buf %p len %u\n", smb, server->ops->calc_smb_size(smb)); } #endif /* CONFIG_CIFS_DEBUG2 */ } void cifs_dump_mids(struct
TCP_Server_Info *server) { #ifdef CONFIG_CIFS_DEBUG2 struct mid_q_entry *mid_entry; if (server == NULL) return; cifs_dbg(VFS, "Dump pending requests:\n"); spin_lock(&server->mid_queue_lock); list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) { cifs_dbg(VFS, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu\n", mid_entry->mid_state, le16_to_cpu(mid_entry->command), mid_entry->pid, mid_entry->callback_data, mid_entry->mid); #ifdef CONFIG_CIFS_STATS2 cifs_dbg(VFS, "IsLarge: %d buf: %p time rcv: %ld now: %ld\n", mid_entry->large_buf, mid_entry->resp_buf, mid_entry->when_received, jiffies); #endif /* STATS2 */ cifs_dbg(VFS, "IsMult: %d IsEnd: %d\n", mid_entry->multiRsp, mid_entry->multiEnd); if (mid_entry->resp_buf) { cifs_dump_detail(mid_entry->resp_buf, server); cifs_dump_mem("existing buf: ", mid_entry->resp_buf, 62); } } spin_unlock(&server->mid_queue_lock); #endif /* CONFIG_CIFS_DEBUG2 */ } #ifdef CONFIG_PROC_FS static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) { __u32 dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); seq_printf(m, "%s Mounts: %d ", tcon->tree_name, tcon->tc_count); if (tcon->nativeFileSystem) seq_printf(m, "Type: %s ", tcon->nativeFileSystem); seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x\n\tPathComponentMax: %d Status: %d", le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), le32_to_cpu(tcon->fsAttrInfo.Attributes), le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), tcon->status); if (dev_type == FILE_DEVICE_DISK) seq_puts(m, " type: DISK "); else if (dev_type == FILE_DEVICE_CD_ROM) seq_puts(m, " type: CDROM "); else seq_printf(m, " type: %d ", dev_type); seq_printf(m, "Serial Number: 0x%x", tcon->vol_serial_number); if ((tcon->seal) || (tcon->ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) || (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA)) seq_puts(m, " encrypted"); if (tcon->nocase) seq_printf(m, " nocase"); if (tcon->unix_ext) seq_printf(m, " POSIX Extensions"); if (tcon->ses->server->ops->dump_share_caps) tcon->ses->server->ops->dump_share_caps(m, tcon); if (tcon->use_witness) seq_puts(m, " Witness"); if (tcon->broken_sparse_sup) seq_puts(m, " nosparse"); if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); spin_lock(&tcon->tc_lock); if (tcon->origin_fullpath) { seq_printf(m, "\n\tDFS origin fullpath: %s", tcon->origin_fullpath); } spin_unlock(&tcon->tc_lock); seq_putc(m, '\n'); } static void cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan) { struct TCP_Server_Info *server = chan->server; if (!server) { seq_printf(m, "\n\n\t\tChannel: %d DISABLED", i+1); return; } seq_printf(m, "\n\n\t\tChannel: %d ConnectionId: 0x%llx" "\n\t\tNumber of credits: %d,%d,%d Dialect 0x%x" "\n\t\tTCP status: %d Instance: %d" "\n\t\tLocal Users To Server: %d SecMode: 0x%x Req On Wire: %d" "\n\t\tIn Send: %d In MaxReq Wait: %d", i+1, server->conn_id, server->credits, server->echo_credits, server->oplock_credits, server->dialect, server->tcpStatus, server->reconnect_instance, server->srv_count, server->sec_mode, in_flight(server), atomic_read(&server->in_send), atomic_read(&server->num_waiters)); #ifdef CONFIG_NET_NS if (server->net) seq_printf(m, " Net namespace: %u ", server->net->ns.inum); #endif /* NET_NS */ } static inline const char *smb_speed_to_str(size_t bps) { size_t mbps = bps / 1000 / 1000; switch (mbps) { case SPEED_10: return "10Mbps"; case SPEED_100: return "100Mbps"; case SPEED_1000: return "1Gbps"; case SPEED_2500: return "2.5Gbps"; case SPEED_5000: return "5Gbps"; case SPEED_10000: return "10Gbps"; case 
SPEED_14000: return "14Gbps"; case SPEED_20000: return "20Gbps"; case SPEED_25000: return "25Gbps"; case SPEED_40000: return "40Gbps"; case SPEED_50000: return "50Gbps"; case SPEED_56000: return "56Gbps"; case SPEED_100000: return "100Gbps"; case SPEED_200000: return "200Gbps"; case SPEED_400000: return "400Gbps"; case SPEED_800000: return "800Gbps"; default: return "Unknown"; } } static void cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface) { struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr; struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr; seq_printf(m, "\tSpeed: %s\n", smb_speed_to_str(iface->speed)); seq_puts(m, "\t\tCapabilities: "); if (iface->rdma_capable) seq_puts(m, "rdma "); if (iface->rss_capable) seq_puts(m, "rss "); if (!iface->rdma_capable && !iface->rss_capable) seq_puts(m, "None"); seq_putc(m, '\n'); if (iface->sockaddr.ss_family == AF_INET) seq_printf(m, "\t\tIPv4: %pI4\n", &ipv4->sin_addr); else if (iface->sockaddr.ss_family == AF_INET6) seq_printf(m, "\t\tIPv6: %pI6\n", &ipv6->sin6_addr); if (!iface->is_active) seq_puts(m, "\t\t[for-cleanup]\n"); } static int cifs_debug_files_proc_show(struct seq_file *m, void *v) { struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; struct cifsFileInfo *cfile; struct inode *inode; struct cifsInodeInfo *cinode; char lease[4]; int n; seq_puts(m, "# Version:1\n"); seq_puts(m, "# Format:\n"); seq_puts(m, "# <tree id> <ses id> <persistent fid> <flags> <count> <pid> <uid>"); #ifdef CONFIG_CIFS_DEBUG2 seq_puts(m, " <filename> <lease> <mid>\n"); #else seq_puts(m, " <filename> <lease>\n"); #endif /* CIFS_DEBUG2 */ spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { if (cifs_ses_exiting(ses)) continue; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { spin_lock(&tcon->open_file_lock); list_for_each_entry(cfile, &tcon->openFileList, tlist) { seq_printf(m, "0x%x 0x%llx 0x%llx 0x%x %d %d %d %pd", tcon->tid, ses->Suid, cfile->fid.persistent_fid, cfile->f_flags, cfile->count, cfile->pid, from_kuid(&init_user_ns, cfile->uid), cfile->dentry); /* Append lease/oplock caching state as RHW letters */ inode = d_inode(cfile->dentry); n = 0; if (inode) { cinode = CIFS_I(inode); if (CIFS_CACHE_READ(cinode)) lease[n++] = 'R'; if (CIFS_CACHE_HANDLE(cinode)) lease[n++] = 'H'; if (CIFS_CACHE_WRITE(cinode)) lease[n++] = 'W'; } lease[n] = '\0'; seq_puts(m, " "); if (n) seq_printf(m, "%s", lease); else seq_puts(m, "NONE"); #ifdef CONFIG_CIFS_DEBUG2 seq_printf(m, " %llu", cfile->fid.mid); #endif /* CONFIG_CIFS_DEBUG2 */ seq_printf(m, "\n"); } spin_unlock(&tcon->open_file_lock); } } } spin_unlock(&cifs_tcp_ses_lock); seq_putc(m, '\n'); return 0; } static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v) { struct list_head *stmp, *tmp, *tmp1; struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; struct cached_fids *cfids; struct cached_fid *cfid; LIST_HEAD(entry); seq_puts(m, "# Version:1\n"); seq_puts(m, "# Format:\n"); seq_puts(m, "# <tree id> <sess id> <persistent fid> <path>\n"); spin_lock(&cifs_tcp_ses_lock); list_for_each(stmp, &cifs_tcp_ses_list) { server = list_entry(stmp, struct TCP_Server_Info, tcp_ses_list); list_for_each(tmp, &server->smb_ses_list) { ses = list_entry(tmp, struct cifs_ses, smb_ses_list); list_for_each(tmp1, &ses->tcon_list) { tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); cfids = tcon->cfids; if (!cfids) continue; 
spin_lock(&cfids->cfid_list_lock); /* check lock ordering */ seq_printf(m, "Num entries: %d, cached_dirents: %lu entries, %llu bytes\n", cfids->num_entries, (unsigned long)atomic_long_read(&cfids->total_dirents_entries), (unsigned long long)atomic64_read(&cfids->total_dirents_bytes)); list_for_each_entry(cfid, &cfids->entries, entry) { seq_printf(m, "0x%x 0x%llx 0x%llx %s", tcon->tid, ses->Suid, cfid->fid.persistent_fid, cfid->path); if (cfid->file_all_info_is_valid) seq_printf(m, "\tvalid file info"); if (cfid->dirents.is_valid) seq_printf(m, ", valid dirents"); if (!list_empty(&cfid->dirents.entries)) seq_printf(m, ", dirents: %lu entries, %lu bytes", cfid->dirents.entries_count, cfid->dirents.bytes_used); seq_printf(m, "\n"); } spin_unlock(&cfids->cfid_list_lock); } } } spin_unlock(&cifs_tcp_ses_lock); seq_putc(m, '\n'); return 0; } static __always_inline const char *compression_alg_str(__le16 alg) { switch (alg) { case SMB3_COMPRESS_NONE: return "NONE"; case SMB3_COMPRESS_LZNT1: return "LZNT1"; case SMB3_COMPRESS_LZ77: return "LZ77"; case SMB3_COMPRESS_LZ77_HUFF: return "LZ77-Huffman"; case SMB3_COMPRESS_PATTERN: return "Pattern_V1"; default: return "invalid"; } } static __always_inline const char *cipher_alg_str(__le16 cipher) { switch (cipher) { case SMB2_ENCRYPTION_AES128_CCM: return "AES128-CCM"; case SMB2_ENCRYPTION_AES128_GCM: return "AES128-GCM"; case SMB2_ENCRYPTION_AES256_CCM: return "AES256-CCM"; case SMB2_ENCRYPTION_AES256_GCM: return "AES256-GCM"; default: return "UNKNOWN"; } } static int cifs_debug_data_proc_show(struct seq_file *m, void *v) { struct mid_q_entry *mid_entry; struct TCP_Server_Info *server; struct TCP_Server_Info *chan_server; struct cifs_ses *ses; struct cifs_tcon *tcon; struct cifs_server_iface *iface; size_t iface_weight = 0, iface_min_speed = 0; struct cifs_server_iface *last_iface = NULL; int c, i, j; seq_puts(m, "Display Internal CIFS Data Structures for Debugging\n" "---------------------------------------------------\n"); seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); seq_printf(m, "Features:"); #ifdef CONFIG_CIFS_DFS_UPCALL seq_printf(m, " DFS"); #endif #ifdef CONFIG_CIFS_FSCACHE seq_printf(m, ",FSCACHE"); #endif #ifdef CONFIG_CIFS_SMB_DIRECT seq_printf(m, ",SMB_DIRECT"); #endif #ifdef CONFIG_CIFS_STATS2 seq_printf(m, ",STATS2"); #else seq_printf(m, ",STATS"); #endif #ifdef CONFIG_CIFS_DEBUG2 seq_printf(m, ",DEBUG2"); #elif defined(CONFIG_CIFS_DEBUG) seq_printf(m, ",DEBUG"); #endif #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY seq_printf(m, ",ALLOW_INSECURE_LEGACY"); #endif #ifdef CONFIG_CIFS_POSIX seq_printf(m, ",CIFS_POSIX"); #endif #ifdef CONFIG_CIFS_UPCALL seq_printf(m, ",UPCALL(SPNEGO)"); #endif #ifdef CONFIG_CIFS_XATTR seq_printf(m, ",XATTR"); #endif seq_printf(m, ",ACL"); #ifdef CONFIG_CIFS_SWN_UPCALL seq_puts(m, ",WITNESS"); #endif #ifdef CONFIG_CIFS_COMPRESSION seq_puts(m, ",COMPRESSION"); #endif seq_putc(m, '\n'); seq_printf(m, "CIFSMaxBufSize: %d\n", CIFSMaxBufSize); seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); seq_printf(m, "\nServers: "); c = 0; spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { #ifdef CONFIG_CIFS_SMB_DIRECT struct smbdirect_socket *sc; struct smbdirect_socket_parameters *sp; #endif /* channel info will be printed as a part of sessions below */ if (SERVER_IS_CHAN(server)) continue; c++; seq_printf(m, "\n%d) ConnectionId: 0x%llx ", c, server->conn_id); spin_lock(&server->srv_lock); if (server->hostname) seq_printf(m, "Hostname: %s ", server->hostname); 
seq_printf(m, "\nClientGUID: %pUL", server->client_guid); spin_unlock(&server->srv_lock); #ifdef CONFIG_CIFS_SMB_DIRECT if (!server->rdma) goto skip_rdma; if (!server->smbd_conn) { seq_printf(m, "\nSMBDirect transport not available"); goto skip_rdma; } sc = &server->smbd_conn->socket; sp = &sc->parameters; seq_printf(m, "\nSMBDirect protocol version: 0x%x " "transport status: %s (%u)", SMBDIRECT_V1, smbdirect_socket_status_string(sc->status), sc->status); seq_printf(m, "\nConn receive_credit_max: %u " "send_credit_target: %u max_send_size: %u", sp->recv_credit_max, sp->send_credit_target, sp->max_send_size); seq_printf(m, "\nConn max_fragmented_recv_size: %u " "max_fragmented_send_size: %u max_receive_size:%u", sp->max_fragmented_recv_size, sp->max_fragmented_send_size, sp->max_recv_size); seq_printf(m, "\nConn keep_alive_interval: %u " "max_readwrite_size: %u rdma_readwrite_threshold: %u", sp->keepalive_interval_msec * 1000, sp->max_read_write_size, server->rdma_readwrite_threshold); seq_printf(m, "\nDebug count_get_receive_buffer: %llu " "count_put_receive_buffer: %llu count_send_empty: %llu", sc->statistics.get_receive_buffer, sc->statistics.put_receive_buffer, sc->statistics.send_empty); seq_printf(m, "\nRead Queue " "count_enqueue_reassembly_queue: %llu " "count_dequeue_reassembly_queue: %llu " "reassembly_data_length: %u " "reassembly_queue_length: %u", sc->statistics.enqueue_reassembly_queue, sc->statistics.dequeue_reassembly_queue, sc->recv_io.reassembly.data_length, sc->recv_io.reassembly.queue_length); seq_printf(m, "\nCurrent Credits send_credits: %u " "receive_credits: %u receive_credit_target: %u", atomic_read(&sc->send_io.credits.count), atomic_read(&sc->recv_io.credits.count), sc->recv_io.credits.target); seq_printf(m, "\nPending send_pending: %u ", atomic_read(&sc->send_io.pending.count)); seq_printf(m, "\nMR responder_resources: %u " "max_frmr_depth: %u mr_type: 0x%x", sp->responder_resources, sp->max_frmr_depth, sc->mr_io.type); seq_printf(m, "\nMR mr_ready_count: %u mr_used_count: %u", atomic_read(&sc->mr_io.ready.count), atomic_read(&sc->mr_io.used.count)); skip_rdma: #endif seq_printf(m, "\nNumber of credits: %d,%d,%d Dialect 0x%x", server->credits, server->echo_credits, server->oplock_credits, server->dialect); if (server->sign) seq_printf(m, " signed"); if (server->posix_ext_supported) seq_printf(m, " posix"); if (server->nosharesock) seq_printf(m, " nosharesock"); seq_printf(m, "\nServer capabilities: 0x%x", server->capabilities); if (server->rdma) seq_printf(m, "\nRDMA "); seq_printf(m, "\nTCP status: %d Instance: %d" "\nLocal Users To Server: %d SecMode: 0x%x Req On Wire: %d", server->tcpStatus, server->reconnect_instance, server->srv_count, server->sec_mode, in_flight(server)); #ifdef CONFIG_NET_NS if (server->net) seq_printf(m, " Net namespace: %u ", server->net->ns.inum); #endif /* NET_NS */ seq_printf(m, "\nIn Send: %d In MaxReq Wait: %d", atomic_read(&server->in_send), atomic_read(&server->num_waiters)); if (server->leaf_fullpath) { seq_printf(m, "\nDFS leaf full path: %s", server->leaf_fullpath); } seq_puts(m, "\nCompression: "); if (!IS_ENABLED(CONFIG_CIFS_COMPRESSION)) seq_puts(m, "no built-in support"); else if (!server->compression.requested) seq_puts(m, "disabled on mount"); else if (server->compression.enabled) seq_printf(m, "enabled (%s)", compression_alg_str(server->compression.alg)); else seq_puts(m, "disabled (not supported by this server)"); /* Show negotiated encryption cipher, even if not required */ seq_puts(m, "\nEncryption: "); if 
(server->cipher_type) seq_printf(m, "Negotiated cipher (%s)", cipher_alg_str(server->cipher_type)); seq_printf(m, "\n\n\tSessions: "); i = 0; list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { spin_lock(&ses->ses_lock); if (ses->ses_status == SES_EXITING) { spin_unlock(&ses->ses_lock); continue; } i++; if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) || (ses->serverNOS == NULL)) { seq_printf(m, "\n\t%d) Address: %s Uses: %d Capability: 0x%x\tSession Status: %d ", i, ses->ip_addr, ses->ses_count, ses->capabilities, ses->ses_status); if (ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) seq_printf(m, "Guest "); else if (ses->session_flags & SMB2_SESSION_FLAG_IS_NULL) seq_printf(m, "Anonymous "); } else { seq_printf(m, "\n\t%d) Name: %s Domain: %s Uses: %d OS: %s " "\n\tNOS: %s\tCapability: 0x%x" "\n\tSMB session status: %d ", i, ses->ip_addr, ses->serverDomain, ses->ses_count, ses->serverOS, ses->serverNOS, ses->capabilities, ses->ses_status); } if (ses->expired_pwd) seq_puts(m, "password no longer valid "); spin_unlock(&ses->ses_lock); seq_printf(m, "\n\tSecurity type: %s ", get_security_type_str(server->ops->select_sectype(server, ses->sectype))); /* dump session id helpful for use with network trace */ seq_printf(m, " SessionId: 0x%llx", ses->Suid); if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) seq_puts(m, " encrypted"); if (ses->sign) seq_puts(m, " signed"); seq_printf(m, "\n\tUser: %d Cred User: %d", from_kuid(&init_user_ns, ses->linux_uid), from_kuid(&init_user_ns, ses->cred_uid)); if (ses->dfs_root_ses) { seq_printf(m, "\n\tDFS root session id: 0x%llx", ses->dfs_root_ses->Suid); } spin_lock(&ses->chan_lock); if (CIFS_CHAN_NEEDS_RECONNECT(ses, 0)) seq_puts(m, "\tPrimary channel: DISCONNECTED "); if (CIFS_CHAN_IN_RECONNECT(ses, 0)) seq_puts(m, "\t[RECONNECTING] "); if (ses->chan_count > 1) { seq_printf(m, "\n\n\tExtra Channels: %zu ", ses->chan_count-1); for (j = 1; j < ses->chan_count; j++) { cifs_dump_channel(m, j, &ses->chans[j]); if (CIFS_CHAN_NEEDS_RECONNECT(ses, j)) seq_puts(m, "\tDISCONNECTED "); if (CIFS_CHAN_IN_RECONNECT(ses, j)) seq_puts(m, "\t[RECONNECTING] "); } } spin_unlock(&ses->chan_lock); seq_puts(m, "\n\n\tShares: "); j = 0; seq_printf(m, "\n\t%d) IPC: ", j); if (ses->tcon_ipc) cifs_debug_tcon(m, ses->tcon_ipc); else seq_puts(m, "none\n"); list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { ++j; seq_printf(m, "\n\t%d) ", j); cifs_debug_tcon(m, tcon); } spin_lock(&ses->iface_lock); if (ses->iface_count) seq_printf(m, "\n\n\tServer interfaces: %zu" "\tLast updated: %lu seconds ago", ses->iface_count, (jiffies - ses->iface_last_update) / HZ); last_iface = list_last_entry(&ses->iface_list, struct cifs_server_iface, iface_head); iface_min_speed = last_iface->speed; j = 0; list_for_each_entry(iface, &ses->iface_list, iface_head) { seq_printf(m, "\n\t%d)", ++j); cifs_dump_iface(m, iface); iface_weight = iface->speed / iface_min_speed; seq_printf(m, "\t\tWeight (cur,total): (%zu,%zu)" "\n\t\tAllocated channels: %u\n", iface->weight_fulfilled, iface_weight, iface->num_channels); if (is_ses_using_iface(ses, iface)) seq_puts(m, "\t\t[CONNECTED]\n"); } spin_unlock(&ses->iface_lock); seq_puts(m, "\n\n\tMIDs: "); spin_lock(&ses->chan_lock); for (j = 0; j < ses->chan_count; j++) { chan_server = ses->chans[j].server; if (!chan_server) continue; if (list_empty(&chan_server->pending_mid_q)) continue; seq_printf(m, "\n\tServer ConnectionId: 0x%llx", chan_server->conn_id); spin_lock(&chan_server->mid_queue_lock); list_for_each_entry(mid_entry, 
&chan_server->pending_mid_q, qhead) { seq_printf(m, "\n\t\tState: %d com: %d pid: %d cbdata: %p mid %llu", mid_entry->mid_state, le16_to_cpu(mid_entry->command), mid_entry->pid, mid_entry->callback_data, mid_entry->mid); } spin_unlock(&chan_server->mid_queue_lock); } spin_unlock(&ses->chan_lock); seq_puts(m, "\n--\n"); } if (i == 0) seq_printf(m, "\n\t\t[NONE]"); } if (c == 0) seq_printf(m, "\n\t[NONE]"); spin_unlock(&cifs_tcp_ses_lock); seq_putc(m, '\n'); cifs_swn_dump(m); /* BB add code to dump additional info such as TCP session info now */ return 0; } static ssize_t cifs_stats_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { bool bv; int rc; struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; rc = kstrtobool_from_user(buffer, count, &bv); if (rc == 0) { #ifdef CONFIG_CIFS_STATS2 int i; atomic_set(&total_buf_alloc_count, 0); atomic_set(&total_small_buf_alloc_count, 0); #endif /* CONFIG_CIFS_STATS2 */ atomic_set(&tcpSesReconnectCount, 0); atomic_set(&tconInfoReconnectCount, 0); spin_lock(&GlobalMid_Lock); GlobalMaxActiveXid = 0; GlobalCurrentXid = 0; spin_unlock(&GlobalMid_Lock); spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { server->max_in_flight = 0; #ifdef CONFIG_CIFS_STATS2 for (i = 0; i < NUMBER_OF_SMB2_COMMANDS; i++) { atomic_set(&server->num_cmds[i], 0); atomic_set(&server->smb2slowcmd[i], 0); server->time_per_cmd[i] = 0; server->slowest_cmd[i] = 0; server->fastest_cmd[0] = 0; } #endif /* CONFIG_CIFS_STATS2 */ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { if (cifs_ses_exiting(ses)) continue; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { atomic_set(&tcon->num_smbs_sent, 0); spin_lock(&tcon->stat_lock); tcon->bytes_read = 0; tcon->bytes_written = 0; tcon->stats_from_time = ktime_get_real_seconds(); spin_unlock(&tcon->stat_lock); if (server->ops->clear_stats) server->ops->clear_stats(tcon); } } } spin_unlock(&cifs_tcp_ses_lock); } else { return rc; } return count; } static int cifs_stats_proc_show(struct seq_file *m, void *v) { int i; #ifdef CONFIG_CIFS_STATS2 int j; #endif /* STATS2 */ struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; seq_printf(m, "Resources in use\nCIFS Session: %d\n", sesInfoAllocCount.counter); seq_printf(m, "Share (unique mount targets): %d\n", tconInfoAllocCount.counter); seq_printf(m, "SMB Request/Response Buffer: %d Pool size: %d\n", buf_alloc_count.counter, cifs_min_rcv + tcpSesAllocCount.counter); seq_printf(m, "SMB Small Req/Resp Buffer: %d Pool size: %d\n", small_buf_alloc_count.counter, cifs_min_small); #ifdef CONFIG_CIFS_STATS2 seq_printf(m, "Total Large %d Small %d Allocations\n", atomic_read(&total_buf_alloc_count), atomic_read(&total_small_buf_alloc_count)); #endif /* CONFIG_CIFS_STATS2 */ seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&mid_count)); seq_printf(m, "\n%d session %d share reconnects\n", tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); seq_printf(m, "Total vfs operations: %d maximum at one time: %d\n", GlobalCurrentXid, GlobalMaxActiveXid); i = 0; spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { seq_printf(m, "\nMax requests in flight: %d", server->max_in_flight); #ifdef CONFIG_CIFS_STATS2 seq_puts(m, "\nTotal time spent processing by command. 
Time "); seq_printf(m, "units are jiffies (%d per second)\n", HZ); seq_puts(m, " SMB3 CMD\tNumber\tTotal Time\tFastest\tSlowest\n"); seq_puts(m, " --------\t------\t----------\t-------\t-------\n"); for (j = 0; j < NUMBER_OF_SMB2_COMMANDS; j++) seq_printf(m, " %d\t\t%d\t%llu\t\t%u\t%u\n", j, atomic_read(&server->num_cmds[j]), server->time_per_cmd[j], server->fastest_cmd[j], server->slowest_cmd[j]); for (j = 0; j < NUMBER_OF_SMB2_COMMANDS; j++) if (atomic_read(&server->smb2slowcmd[j])) { spin_lock(&server->srv_lock); seq_printf(m, " %d slow responses from %s for command %d\n", atomic_read(&server->smb2slowcmd[j]), server->hostname, j); spin_unlock(&server->srv_lock); } #endif /* STATS2 */ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { if (cifs_ses_exiting(ses)) continue; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { i++; seq_printf(m, "\n%d) %s", i, tcon->tree_name); if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); seq_printf(m, "\nSMBs: %d since %ptTs UTC", atomic_read(&tcon->num_smbs_sent), &tcon->stats_from_time); if (server->ops->print_stats) server->ops->print_stats(m, tcon); } } } spin_unlock(&cifs_tcp_ses_lock); seq_putc(m, '\n'); return 0; } static int cifs_stats_proc_open(struct inode *inode, struct file *file) { return single_open(file, cifs_stats_proc_show, NULL); } static const struct proc_ops cifs_stats_proc_ops = { .proc_open = cifs_stats_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, .proc_write = cifs_stats_proc_write, }; #ifdef CONFIG_CIFS_SMB_DIRECT #define PROC_FILE_DEFINE(name) \ static ssize_t name##_write(struct file *file, const char __user *buffer, \ size_t count, loff_t *ppos) \ { \ int rc; \ rc = kstrtoint_from_user(buffer, count, 10, &name); \ if (rc) \ return rc; \ return count; \ } \ static int name##_proc_show(struct seq_file *m, void *v) \ { \ seq_printf(m, "%d\n", name); \ return 0; \ } \ static int name##_open(struct inode *inode, struct file *file) \ { \ return single_open(file, name##_proc_show, NULL); \ } \ \ static const struct proc_ops cifs_##name##_proc_fops = { \ .proc_open = name##_open, \ .proc_read = seq_read, \ .proc_lseek = seq_lseek, \ .proc_release = single_release, \ .proc_write = name##_write, \ } PROC_FILE_DEFINE(rdma_readwrite_threshold); PROC_FILE_DEFINE(smbd_max_frmr_depth); PROC_FILE_DEFINE(smbd_keep_alive_interval); PROC_FILE_DEFINE(smbd_max_receive_size); PROC_FILE_DEFINE(smbd_max_fragmented_recv_size); PROC_FILE_DEFINE(smbd_max_send_size); PROC_FILE_DEFINE(smbd_send_credit_target); PROC_FILE_DEFINE(smbd_receive_credit_max); #endif static struct proc_dir_entry *proc_fs_cifs; static const struct proc_ops cifsFYI_proc_ops; static const struct proc_ops cifs_lookup_cache_proc_ops; static const struct proc_ops traceSMB_proc_ops; static const struct proc_ops cifs_security_flags_proc_ops; static const struct proc_ops cifs_linux_ext_proc_ops; static const struct proc_ops cifs_mount_params_proc_ops; void cifs_proc_init(void) { proc_fs_cifs = proc_mkdir("fs/cifs", NULL); if (proc_fs_cifs == NULL) return; proc_create_single("DebugData", 0, proc_fs_cifs, cifs_debug_data_proc_show); proc_create_single("open_files", 0400, proc_fs_cifs, cifs_debug_files_proc_show); proc_create_single("open_dirs", 0400, proc_fs_cifs, cifs_debug_dirs_proc_show); proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_ops); proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_ops); proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_ops); proc_create("LinuxExtensionsEnabled", 
0644, proc_fs_cifs, &cifs_linux_ext_proc_ops); proc_create("SecurityFlags", 0644, proc_fs_cifs, &cifs_security_flags_proc_ops); proc_create("LookupCacheEnabled", 0644, proc_fs_cifs, &cifs_lookup_cache_proc_ops); proc_create("mount_params", 0444, proc_fs_cifs, &cifs_mount_params_proc_ops); #ifdef CONFIG_CIFS_DFS_UPCALL proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_ops); #endif #ifdef CONFIG_CIFS_SMB_DIRECT proc_create("rdma_readwrite_threshold", 0644, proc_fs_cifs, &cifs_rdma_readwrite_threshold_proc_fops); proc_create("smbd_max_frmr_depth", 0644, proc_fs_cifs, &cifs_smbd_max_frmr_depth_proc_fops); proc_create("smbd_keep_alive_interval", 0644, proc_fs_cifs, &cifs_smbd_keep_alive_interval_proc_fops); proc_create("smbd_max_receive_size", 0644, proc_fs_cifs, &cifs_smbd_max_receive_size_proc_fops); proc_create("smbd_max_fragmented_recv_size", 0644, proc_fs_cifs, &cifs_smbd_max_fragmented_recv_size_proc_fops); proc_create("smbd_max_send_size", 0644, proc_fs_cifs, &cifs_smbd_max_send_size_proc_fops); proc_create("smbd_send_credit_target", 0644, proc_fs_cifs, &cifs_smbd_send_credit_target_proc_fops); proc_create("smbd_receive_credit_max", 0644, proc_fs_cifs, &cifs_smbd_receive_credit_max_proc_fops); #endif } void cifs_proc_clean(void) { if (proc_fs_cifs == NULL) return; remove_proc_entry("DebugData", proc_fs_cifs); remove_proc_entry("open_files", proc_fs_cifs); remove_proc_entry("open_dirs", proc_fs_cifs); remove_proc_entry("cifsFYI", proc_fs_cifs); remove_proc_entry("traceSMB", proc_fs_cifs); remove_proc_entry("Stats", proc_fs_cifs); remove_proc_entry("SecurityFlags", proc_fs_cifs); remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); remove_proc_entry("mount_params", proc_fs_cifs); #ifdef CONFIG_CIFS_DFS_UPCALL remove_proc_entry("dfscache", proc_fs_cifs); #endif #ifdef CONFIG_CIFS_SMB_DIRECT remove_proc_entry("rdma_readwrite_threshold", proc_fs_cifs); remove_proc_entry("smbd_max_frmr_depth", proc_fs_cifs); remove_proc_entry("smbd_keep_alive_interval", proc_fs_cifs); remove_proc_entry("smbd_max_receive_size", proc_fs_cifs); remove_proc_entry("smbd_max_fragmented_recv_size", proc_fs_cifs); remove_proc_entry("smbd_max_send_size", proc_fs_cifs); remove_proc_entry("smbd_send_credit_target", proc_fs_cifs); remove_proc_entry("smbd_receive_credit_max", proc_fs_cifs); #endif remove_proc_entry("fs/cifs", NULL); } static int cifsFYI_proc_show(struct seq_file *m, void *v) { seq_printf(m, "%d\n", cifsFYI); return 0; } static int cifsFYI_proc_open(struct inode *inode, struct file *file) { return single_open(file, cifsFYI_proc_show, NULL); } static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { char c[2] = { '\0' }; bool bv; int rc; rc = get_user(c[0], buffer); if (rc) return rc; if (kstrtobool(c, &bv) == 0) cifsFYI = bv; else if ((c[0] > '1') && (c[0] <= '9')) cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */ else return -EINVAL; return count; } static const struct proc_ops cifsFYI_proc_ops = { .proc_open = cifsFYI_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, .proc_write = cifsFYI_proc_write, }; static int cifs_linux_ext_proc_show(struct seq_file *m, void *v) { seq_printf(m, "%d\n", linuxExtEnabled); return 0; } static int cifs_linux_ext_proc_open(struct inode *inode, struct file *file) { return single_open(file, cifs_linux_ext_proc_show, NULL); } static ssize_t cifs_linux_ext_proc_write(struct file *file, const char 
__user *buffer, size_t count, loff_t *ppos) { int rc; rc = kstrtobool_from_user(buffer, count, &linuxExtEnabled); if (rc) return rc; return count; } static const struct proc_ops cifs_linux_ext_proc_ops = { .proc_open = cifs_linux_ext_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, .proc_write = cifs_linux_ext_proc_write, }; static int cifs_lookup_cache_proc_show(struct seq_file *m, void *v) { seq_printf(m, "%d\n", lookupCacheEnabled); return 0; } static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file) { return single_open(file, cifs_lookup_cache_proc_show, NULL); } static ssize_t cifs_lookup_cache_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { int rc; rc = kstrtobool_from_user(buffer, count, &lookupCacheEnabled); if (rc) return rc; return count; } static const struct proc_ops cifs_lookup_cache_proc_ops = { .proc_open = cifs_lookup_cache_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, .proc_write = cifs_lookup_cache_proc_write, }; static int traceSMB_proc_show(struct seq_file *m, void *v) { seq_printf(m, "%d\n", traceSMB); return 0; } static int traceSMB_proc_open(struct inode *inode, struct file *file) { return single_open(file, traceSMB_proc_show, NULL); } static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { int rc; rc = kstrtobool_from_user(buffer, count, &traceSMB); if (rc) return rc; return count; } static const struct proc_ops traceSMB_proc_ops = { .proc_open = traceSMB_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, .proc_write = traceSMB_proc_write, }; static int cifs_security_flags_proc_show(struct seq_file *m, void *v) { seq_printf(m, "0x%x\n", global_secflags); return 0; } static int cifs_security_flags_proc_open(struct inode *inode, struct file *file) { return single_open(file, cifs_security_flags_proc_show, NULL); } /* * Ensure that if someone sets a MUST flag, that we disable all other MAY * flags except for the ones corresponding to the given MUST flag. If there are * multiple MUST flags, then try to prefer more secure ones. */ static void cifs_security_flags_handle_must_flags(unsigned int *flags) { unsigned int signflags = *flags & (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL); if ((*flags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) *flags = CIFSSEC_MUST_KRB5; else if ((*flags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) *flags = CIFSSEC_MUST_NTLMSSP; else if ((*flags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2) *flags = CIFSSEC_MUST_NTLMV2; *flags |= signflags; } static ssize_t cifs_security_flags_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { int rc; unsigned int flags; char flags_string[12]; bool bv; if ((count < 1) || (count > 11)) return -EINVAL; memset(flags_string, 0, sizeof(flags_string)); if (copy_from_user(flags_string, buffer, count)) return -EFAULT; if (count < 3) { /* single char or single char followed by null */ if (kstrtobool(flags_string, &bv) == 0) { global_secflags = bv ? 
CIFSSEC_MAX : CIFSSEC_DEF; return count; } else if (!isdigit(flags_string[0])) { cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", flags_string); return -EINVAL; } } /* else we have a number */ rc = kstrtouint(flags_string, 0, &flags); if (rc) { cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", flags_string); return rc; } cifs_dbg(FYI, "sec flags 0x%x\n", flags); if (flags == 0) { cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", flags_string); return -EINVAL; } if (flags & ~CIFSSEC_MASK) { cifs_dbg(VFS, "Unsupported security flags: 0x%x\n", flags & ~CIFSSEC_MASK); return -EINVAL; } cifs_security_flags_handle_must_flags(&flags); /* flags look ok - update the global security flags for cifs module */ global_secflags = flags; if (global_secflags & CIFSSEC_MUST_SIGN) { /* requiring signing implies signing is allowed */ global_secflags |= CIFSSEC_MAY_SIGN; cifs_dbg(FYI, "packet signing now required\n"); } else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) { cifs_dbg(FYI, "packet signing disabled\n"); } /* BB should we turn on MAY flags for other MUST options? */ return count; } static const struct proc_ops cifs_security_flags_proc_ops = { .proc_open = cifs_security_flags_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, .proc_write = cifs_security_flags_proc_write, }; /* To make it easier to debug, can help to show mount params */ static int cifs_mount_params_proc_show(struct seq_file *m, void *v) { const struct fs_parameter_spec *p; const char *type; for (p = smb3_fs_parameters; p->name; p++) { /* cannot use switch with pointers... */ if (!p->type) { if (p->flags == fs_param_neg_with_no) type = "noflag"; else type = "flag"; } else if (p->type == fs_param_is_bool) type = "bool"; else if (p->type == fs_param_is_u32) type = "u32"; else if (p->type == fs_param_is_u64) type = "u64"; else if (p->type == fs_param_is_string) type = "string"; else type = "unknown"; seq_printf(m, "%s:%s\n", p->name, type); } return 0; } static int cifs_mount_params_proc_open(struct inode *inode, struct file *file) { return single_open(file, cifs_mount_params_proc_show, NULL); } static const struct proc_ops cifs_mount_params_proc_ops = { .proc_open = cifs_mount_params_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, /* No need for write for now */ /* .proc_write = cifs_mount_params_proc_write, */ }; #else inline void cifs_proc_init(void) { } inline void cifs_proc_clean(void) { } #endif /* PROC_FS */ |
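/*
 * Illustrative user-space counterpart (not kernel code): the procfs entries
 * registered by cifs_proc_init() above are plain files under /proc/fs/cifs.
 * This sketch assumes the cifs module is loaded; writing "1" to cifsFYI
 * enables the basic debug messages handled by cifsFYI_proc_write().
 */
#include <stdio.h>

static void dump_proc_file(const char *path)
{
	char line[512];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	printf("==== %s ====\n", path);
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	FILE *fyi = fopen("/proc/fs/cifs/cifsFYI", "w");

	if (fyi) {
		fputs("1\n", fyi);	/* see cifsFYI_proc_write() above */
		fclose(fyi);
	}
	dump_proc_file("/proc/fs/cifs/Stats");
	dump_proc_file("/proc/fs/cifs/DebugData");
	return 0;
}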
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_SMT_H
#define _LINUX_SCHED_SMT_H

#include <linux/static_key.h>

#ifdef CONFIG_SCHED_SMT
extern struct static_key_false sched_smt_present;

static __always_inline bool sched_smt_active(void)
{
	return static_branch_likely(&sched_smt_present);
}
#else
static __always_inline bool sched_smt_active(void)
{
	return false;
}
#endif

void arch_smt_update(void);

#endif /* _LINUX_SCHED_SMT_H */
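/*
 * User-visible counterpart of sched_smt_active() (illustrative, not part of
 * the header above): on kernels exposing the SMT control interface,
 * /sys/devices/system/cpu/smt/active reports whether SMT siblings are
 * currently enabled, mirroring the sched_smt_present static key.
 */
#include <stdio.h>

int main(void)
{
	int active = -1;
	FILE *f = fopen("/sys/devices/system/cpu/smt/active", "r");

	if (!f) {
		perror("smt/active");
		return 1;
	}
	if (fscanf(f, "%d", &active) != 1)
		active = -1;
	fclose(f);
	printf("SMT active: %d\n", active);
	return 0;
}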
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* memcontrol.h - Memory Controller * * Copyright IBM Corporation, 2007 * Author Balbir Singh <balbir@linux.vnet.ibm.com> * * Copyright 2007 OpenVZ SWsoft Inc * Author: Pavel Emelianov <xemul@openvz.org> */ #ifndef _LINUX_MEMCONTROL_H #define _LINUX_MEMCONTROL_H #include <linux/cgroup.h> #include <linux/vm_event_item.h> #include <linux/hardirq.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/page_counter.h> #include <linux/vmpressure.h> #include <linux/eventfd.h> #include <linux/mm.h> #include <linux/vmstat.h> #include <linux/writeback.h> #include <linux/page-flags.h> #include <linux/shrinker.h> struct mem_cgroup; struct obj_cgroup; struct page; struct mm_struct; struct kmem_cache; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, MEMCG_PERCPU_B, MEMCG_VMALLOC, MEMCG_KMEM, MEMCG_ZSWAP_B, MEMCG_ZSWAPPED, MEMCG_NR_STAT, }; enum memcg_memory_event { MEMCG_LOW, MEMCG_HIGH, MEMCG_MAX, MEMCG_OOM, MEMCG_OOM_KILL, MEMCG_OOM_GROUP_KILL, MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, MEMCG_NR_MEMORY_EVENTS, }; struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; int generation; }; #ifdef CONFIG_MEMCG #define MEM_CGROUP_ID_SHIFT 16 struct mem_cgroup_id { int id; refcount_t ref; }; struct memcg_vmstats_percpu; struct memcg1_events_percpu; struct memcg_vmstats; struct lruvec_stats_percpu; struct lruvec_stats; struct
mem_cgroup_reclaim_iter { struct mem_cgroup *position; /* scan generation, increased every round-trip */ atomic_t generation; }; /* * per-node information in memory controller. */ struct mem_cgroup_per_node { /* Keep the read-only fields at the start */ struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ struct lruvec_stats_percpu __percpu *lruvec_stats_percpu; struct lruvec_stats *lruvec_stats; struct shrinker_info __rcu *shrinker_info; #ifdef CONFIG_MEMCG_V1 /* * Memcg-v1 only stuff in middle as buffer between read mostly fields * and update often fields to avoid false sharing. If v1 stuff is * not present, an explicit padding is needed. */ struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; #else CACHELINE_PADDING(_pad1_); #endif /* Fields which get updated often at the end. */ struct lruvec lruvec; CACHELINE_PADDING(_pad2_); unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter; #ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC /* slab stats for nmi context */ atomic_t slab_reclaimable; atomic_t slab_unreclaimable; #endif }; struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; unsigned long threshold; }; /* For threshold */ struct mem_cgroup_threshold_ary { /* An array index points to threshold just below or equal to usage. */ int current_threshold; /* Size of entries[] */ unsigned int size; /* Array of thresholds */ struct mem_cgroup_threshold entries[] __counted_by(size); }; struct mem_cgroup_thresholds { /* Primary thresholds array */ struct mem_cgroup_threshold_ary *primary; /* * Spare threshold array. * This is needed to make mem_cgroup_unregister_event() "never fail". * It must be able to store at least primary->size - 1 entries. */ struct mem_cgroup_threshold_ary *spare; }; /* * Remember four most recent foreign writebacks with dirty pages in this * cgroup. Inode sharing is expected to be uncommon and, even if we miss * one in a given round, we're likely to catch it later if it keeps * foreign-dirtying, so a fairly low count should be enough. * * See mem_cgroup_track_foreign_dirty_slowpath() for details. */ #define MEMCG_CGWB_FRN_CNT 4 struct memcg_cgwb_frn { u64 bdi_id; /* bdi->id of the foreign inode */ int memcg_id; /* memcg->css.id of foreign inode */ u64 at; /* jiffies_64 at the time of dirtying */ struct wb_completion done; /* tracks in-flight foreign writebacks */ }; /* * Bucket for arbitrarily byte-sized objects charged to a memory * cgroup. The bucket can be reparented in one piece when the cgroup * is destroyed, without having to round up the individual references * of all live memory objects in the wild. */ struct obj_cgroup { struct percpu_ref refcnt; struct mem_cgroup *memcg; atomic_t nr_charged_bytes; union { struct list_head list; /* protected by objcg_lock */ struct rcu_head rcu; }; }; /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide * statistics based on the statistics developed by Rik Van Riel for clock-pro, * to help the administrator determine what knobs to tune. */ struct mem_cgroup { struct cgroup_subsys_state css; /* Private memcg ID. 
Used to ID objects that outlive the cgroup */ struct mem_cgroup_id id; /* Accounted resources */ struct page_counter memory; /* Both v1 & v2 */ union { struct page_counter swap; /* v2 only */ struct page_counter memsw; /* v1 only */ }; /* registered local peak watchers */ struct list_head memory_peaks; struct list_head swap_peaks; spinlock_t peaks_lock; /* Range enforcement for interrupt charges */ struct work_struct high_work; #ifdef CONFIG_ZSWAP unsigned long zswap_max; /* * Prevent pages from this memcg from being written back from zswap to * swap, and from being swapped out on zswap store failures. */ bool zswap_writeback; #endif /* vmpressure notifications */ struct vmpressure vmpressure; /* * Should the OOM killer kill all belonging tasks, had it kill one? */ bool oom_group; int swappiness; /* memory.events and memory.events.local */ struct cgroup_file events_file; struct cgroup_file events_local_file; /* handle for "memory.swap.events" */ struct cgroup_file swap_events_file; /* memory.stat */ struct memcg_vmstats *vmstats; /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS]; #ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC /* MEMCG_KMEM for nmi context */ atomic_t kmem_stat; #endif /* * Hint of reclaim pressure for socket memroy management. Note * that this indicator should NOT be used in legacy cgroup mode * where socket memory is accounted/charged separately. */ u64 socket_pressure; #if BITS_PER_LONG < 64 seqlock_t socket_pressure_seqlock; #endif int kmemcg_id; /* * memcg->objcg is wiped out as a part of the objcg repaprenting * process. memcg->orig_objcg preserves a pointer (and a reference) * to the original objcg until the end of live of memcg. */ struct obj_cgroup __rcu *objcg; struct obj_cgroup *orig_objcg; /* list of inherited objcgs, protected by objcg_lock */ struct list_head objcg_list; struct memcg_vmstats_percpu __percpu *vmstats_percpu; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; struct wb_domain cgwb_domain; struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT]; #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif #ifdef CONFIG_LRU_GEN_WALKS_MMU /* per-memcg mm_struct list */ struct lru_gen_mm_list mm_list; #endif #ifdef CONFIG_MEMCG_V1 /* Legacy consumer-oriented counters */ struct page_counter kmem; /* v1 only */ struct page_counter tcpmem; /* v1 only */ struct memcg1_events_percpu __percpu *events_percpu; unsigned long soft_limit; /* protected by memcg_oom_lock */ bool oom_lock; int under_oom; /* OOM-Killer disable */ int oom_kill_disable; /* protect arrays of thresholds */ struct mutex thresholds_lock; /* thresholds for memory usage. RCU-protected */ struct mem_cgroup_thresholds thresholds; /* thresholds for mem+swap usage. RCU-protected */ struct mem_cgroup_thresholds memsw_thresholds; /* For oom notifier event fd */ struct list_head oom_notify; /* Legacy tcp memory accounting */ bool tcpmem_active; int tcpmem_pressure; /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; #endif /* CONFIG_MEMCG_V1 */ struct mem_cgroup_per_node *nodeinfo[]; }; /* * size of first charge trial. * TODO: maybe necessary to use big numbers in big irons or dynamic based of the * workload. 
*/ #define MEMCG_CHARGE_BATCH 64U extern struct mem_cgroup *root_mem_cgroup; enum page_memcg_data_flags { /* page->memcg_data is a pointer to an slabobj_ext vector */ MEMCG_DATA_OBJEXTS = (1UL << 0), /* page has been accounted as a non-slab kernel page */ MEMCG_DATA_KMEM = (1UL << 1), /* the next bit after the last actual flag */ __NR_MEMCG_DATA_FLAGS = (1UL << 2), }; #define __OBJEXTS_ALLOC_FAIL MEMCG_DATA_OBJEXTS #define __FIRST_OBJEXT_FLAG __NR_MEMCG_DATA_FLAGS #else /* CONFIG_MEMCG */ #define __OBJEXTS_ALLOC_FAIL (1UL << 0) #define __FIRST_OBJEXT_FLAG (1UL << 0) #endif /* CONFIG_MEMCG */ enum objext_flags { /* * Use bit 0 with zero other bits to signal that slabobj_ext vector * failed to allocate. The same bit 0 with valid upper bits means * MEMCG_DATA_OBJEXTS. */ OBJEXTS_ALLOC_FAIL = __OBJEXTS_ALLOC_FAIL, /* slabobj_ext vector allocated with kmalloc_nolock() */ OBJEXTS_NOSPIN_ALLOC = __FIRST_OBJEXT_FLAG, /* the next bit after the last actual flag */ __NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1), }; #define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1) #ifdef CONFIG_MEMCG static inline bool folio_memcg_kmem(struct folio *folio); /* * After the initialization objcg->memcg is always pointing at * a valid memcg, but can be atomically swapped to the parent memcg. * * The caller must ensure that the returned memcg won't be released. */ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) { lockdep_assert_once(rcu_read_lock_held() || lockdep_is_held(&cgroup_mutex)); return READ_ONCE(objcg->memcg); } /* * __folio_memcg - Get the memory cgroup associated with a non-kmem folio * @folio: Pointer to the folio. * * Returns a pointer to the memory cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios or * kmem folios. */ static inline struct mem_cgroup *__folio_memcg(struct folio *folio) { unsigned long memcg_data = folio->memcg_data; VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio); return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } /* * __folio_objcg - get the object cgroup associated with a kmem folio. * @folio: Pointer to the folio. * * Returns a pointer to the object cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a * proper object cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios or * LRU folios. */ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) { unsigned long memcg_data = folio->memcg_data; VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio); return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } /* * folio_memcg - Get the memory cgroup associated with a folio. * @folio: Pointer to the folio. * * Returns a pointer to the memory cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios. 
* * For a non-kmem folio any of the following ensures folio and memcg binding * stability: * * - the folio lock * - LRU isolation * - exclusive reference * * For a kmem folio a caller should hold an rcu read lock to protect memcg * associated with a kmem folio from being released. */ static inline struct mem_cgroup *folio_memcg(struct folio *folio) { if (folio_memcg_kmem(folio)) return obj_cgroup_memcg(__folio_objcg(folio)); return __folio_memcg(folio); } /* * folio_memcg_charged - If a folio is charged to a memory cgroup. * @folio: Pointer to the folio. * * Returns true if folio is charged to a memory cgroup, otherwise returns false. */ static inline bool folio_memcg_charged(struct folio *folio) { return folio->memcg_data != 0; } /* * folio_memcg_check - Get the memory cgroup associated with a folio. * @folio: Pointer to the folio. * * Returns a pointer to the memory cgroup associated with the folio, * or NULL. This function unlike folio_memcg() can take any folio * as an argument. It has to be used in cases when it's not known if a folio * has an associated memory cgroup pointer or an object cgroups vector or * an object cgroup. * * For a non-kmem folio any of the following ensures folio and memcg binding * stability: * * - the folio lock * - LRU isolation * - exclusive reference * * For a kmem folio a caller should hold an rcu read lock to protect memcg * associated with a kmem folio from being released. */ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { /* * Because folio->memcg_data might be changed asynchronously * for slabs, READ_ONCE() should be used here. */ unsigned long memcg_data = READ_ONCE(folio->memcg_data); if (memcg_data & MEMCG_DATA_OBJEXTS) return NULL; if (memcg_data & MEMCG_DATA_KMEM) { struct obj_cgroup *objcg; objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); return obj_cgroup_memcg(objcg); } return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } static inline struct mem_cgroup *page_memcg_check(struct page *page) { if (PageTail(page)) return NULL; return folio_memcg_check((struct folio *)page); } static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) { struct mem_cgroup *memcg; rcu_read_lock(); retry: memcg = obj_cgroup_memcg(objcg); if (unlikely(!css_tryget(&memcg->css))) goto retry; rcu_read_unlock(); return memcg; } /* * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set. * @folio: Pointer to the folio. * * Checks if the folio has MemcgKmem flag set. The caller must ensure * that the folio has an associated memory cgroup. It's not safe to call * this function against some types of folios, e.g. slab folios. */ static inline bool folio_memcg_kmem(struct folio *folio) { VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page); VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio); return folio->memcg_data & MEMCG_DATA_KMEM; } static inline bool PageMemcgKmem(struct page *page) { return folio_memcg_kmem(page_folio(page)); } static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); } static inline bool mem_cgroup_disabled(void) { return !cgroup_subsys_enabled(memory_cgrp_subsys); } static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, unsigned long *low) { *min = *low = 0; if (mem_cgroup_disabled()) return; /* * There is no reclaim protection applied to a targeted reclaim. 
* We are special casing this specific case here because * mem_cgroup_calculate_protection is not robust enough to keep * the protection invariant for calculated effective values for * parallel reclaimers with different reclaim target. This is * especially a problem for tail memcgs (as they have pages on LRU) * which would want to have effective values 0 for targeted reclaim * but a different value for external reclaim. * * Example * Let's have global and A's reclaim in parallel: * | * A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G) * |\ * | C (low = 1G, usage = 2.5G) * B (low = 1G, usage = 0.5G) * * For the global reclaim * A.elow = A.low * B.elow = min(B.usage, B.low) because children_low_usage <= A.elow * C.elow = min(C.usage, C.low) * * With the effective values resetting we have A reclaim * A.elow = 0 * B.elow = B.low * C.elow = C.low * * If the global reclaim races with A's reclaim then * B.elow = C.elow = 0 because children_low_usage > A.elow) * is possible and reclaiming B would be violating the protection. * */ if (root == memcg) return; *min = READ_ONCE(memcg->memory.emin); *low = READ_ONCE(memcg->memory.elow); } void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg); static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, struct mem_cgroup *memcg) { /* * The root memcg doesn't account charges, and doesn't support * protection. The target memcg's protection is ignored, see * mem_cgroup_calculate_protection() and mem_cgroup_protection() */ return mem_cgroup_disabled() || mem_cgroup_is_root(memcg) || memcg == target; } static inline bool mem_cgroup_below_low(struct mem_cgroup *target, struct mem_cgroup *memcg) { if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.elow) >= page_counter_read(&memcg->memory); } static inline bool mem_cgroup_below_min(struct mem_cgroup *target, struct mem_cgroup *memcg) { if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.emin) >= page_counter_read(&memcg->memory); } int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp); /** * mem_cgroup_charge - Charge a newly allocated folio to a cgroup. * @folio: Folio to charge. * @mm: mm context of the allocating task. * @gfp: Reclaim mode. * * Try to charge @folio to the memcg that @mm belongs to, reclaiming * pages according to @gfp if necessary. If @mm is NULL, try to * charge to the active memcg. * * Do not use this for folios allocated for swapin. * * Return: 0 on success. Otherwise, an error code is returned. */ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { if (mem_cgroup_disabled()) return 0; return __mem_cgroup_charge(folio, mm, gfp); } int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void __mem_cgroup_uncharge(struct folio *folio); /** * mem_cgroup_uncharge - Uncharge a folio. * @folio: Folio to uncharge. * * Uncharge a folio previously charged with mem_cgroup_charge(). 
*/ static inline void mem_cgroup_uncharge(struct folio *folio) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge(folio); } void __mem_cgroup_uncharge_folios(struct folio_batch *folios); static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge_folios(folios); } void mem_cgroup_replace_folio(struct folio *old, struct folio *new); void mem_cgroup_migrate(struct folio *old, struct folio *new); /** * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec * @pgdat: pglist_data * * Returns the lru list vector holding pages for a given @memcg & * @pgdat combination. This can be the node lruvec, if the memory * controller is disabled. */ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct pglist_data *pgdat) { struct mem_cgroup_per_node *mz; struct lruvec *lruvec; if (mem_cgroup_disabled()) { lruvec = &pgdat->__lruvec; goto out; } if (!memcg) memcg = root_mem_cgroup; mz = memcg->nodeinfo[pgdat->node_id]; lruvec = &mz->lruvec; out: /* * Since a node can be onlined after the mem_cgroup was created, * we have to be prepared to initialize lruvec->pgdat here; * and if offlined then reonlined, we need to reinitialize it. */ if (unlikely(lruvec->pgdat != pgdat)) lruvec->pgdat = pgdat; return lruvec; } /** * folio_lruvec - return lruvec for isolating/putting an LRU folio * @folio: Pointer to the folio. * * This function relies on folio->mem_cgroup being stable. */ static inline struct lruvec *folio_lruvec(struct folio *folio) { struct mem_cgroup *memcg = folio_memcg(folio); VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled(), folio); return mem_cgroup_lruvec(memcg, folio_pgdat(folio)); } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); struct mem_cgroup *get_mem_cgroup_from_current(void); struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio); struct lruvec *folio_lruvec_lock(struct folio *folio); struct lruvec *folio_lruvec_lock_irq(struct folio *folio); struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags); #ifdef CONFIG_DEBUG_VM void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio); #else static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { } #endif static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? 
container_of(css, struct mem_cgroup, css) : NULL; } static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg) { return percpu_ref_tryget(&objcg->refcnt); } static inline void obj_cgroup_get(struct obj_cgroup *objcg) { percpu_ref_get(&objcg->refcnt); } static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, unsigned long nr) { percpu_ref_get_many(&objcg->refcnt, nr); } static inline void obj_cgroup_put(struct obj_cgroup *objcg) { if (objcg) percpu_ref_put(&objcg->refcnt); } static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) { return !memcg || css_tryget(&memcg->css); } static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) { return !memcg || css_tryget_online(&memcg->css); } static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) css_put(&memcg->css); } #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, struct mem_cgroup_reclaim_cookie *); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*)(struct task_struct *, void *), void *arg); static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return 0; return memcg->id.id; } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); #ifdef CONFIG_SHRINKER_DEBUG static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return memcg ? cgroup_ino(memcg->css.cgroup) : 0; } struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); #endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return mem_cgroup_from_css(seq_css(m)); } static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { struct mem_cgroup_per_node *mz; if (mem_cgroup_disabled()) return NULL; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return mz->memcg; } /** * parent_mem_cgroup - find the accounting parent of a memcg * @memcg: memcg whose parent to find * * Returns the parent memcg, or NULL if this is the root. 
*/ static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return mem_cgroup_from_css(memcg->css.parent); } static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) { if (root == memcg) return true; return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); } static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { struct mem_cgroup *task_memcg; bool match = false; rcu_read_lock(); task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (task_memcg) match = mem_cgroup_is_descendant(task_memcg, memcg); rcu_read_unlock(); return match; } struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio); ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return true; return !!(memcg->css.flags & CSS_ONLINE); } void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages); static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { struct mem_cgroup_per_node *mz; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } void __mem_cgroup_handle_over_high(gfp_t gfp_mask); static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { if (unlikely(current->memcg_nr_pages_over_high)) __mem_cgroup_handle_over_high(gfp_mask); } unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); unsigned long mem_cgroup_size(struct mem_cgroup *memcg); void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p); void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg); struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); /* idx can be of type enum memcg_stat_item or node_stat_item */ void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val); static inline void mod_memcg_page_state(struct page *page, enum memcg_stat_item idx, int val) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = folio_memcg(page_folio(page)); if (memcg) mod_memcg_state(memcg, idx, val); rcu_read_unlock(); } unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx); unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx); void mem_cgroup_flush_stats(struct mem_cgroup *memcg); void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __mod_lruvec_kmem_state(p, idx, val); local_irq_restore(flags); } void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count); static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { struct mem_cgroup *memcg = folio_memcg(folio); if (memcg) count_memcg_events(memcg, idx, nr); } static inline void count_memcg_events_mm(struct mm_struct *mm, enum vm_event_item idx, unsigned long count) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) count_memcg_events(memcg, 
idx, count); rcu_read_unlock(); } static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { count_memcg_events_mm(mm, idx, 1); } static inline void __memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event, bool allow_spinning) { bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || event == MEMCG_SWAP_FAIL; /* For now only MEMCG_MAX can happen with !allow_spinning context. */ VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX); atomic_long_inc(&memcg->memory_events_local[event]); if (!swap_event && allow_spinning) cgroup_file_notify(&memcg->events_local_file); do { atomic_long_inc(&memcg->memory_events[event]); if (allow_spinning) { if (swap_event) cgroup_file_notify(&memcg->swap_events_file); else cgroup_file_notify(&memcg->events_file); } if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) break; if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) break; } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); } static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { __memcg_memory_event(memcg, event, true); } static inline void memcg_memory_event_mm(struct mm_struct *mm, enum memcg_memory_event event) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) memcg_memory_event(memcg, event); rcu_read_unlock(); } void split_page_memcg(struct page *first, unsigned order); void folio_split_memcg_refs(struct folio *folio, unsigned old_order, unsigned new_order); static inline u64 cgroup_id_from_mm(struct mm_struct *mm) { struct mem_cgroup *memcg; u64 id; if (mem_cgroup_disabled()) return 0; rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (!memcg) memcg = root_mem_cgroup; id = cgroup_id(memcg->css.cgroup); rcu_read_unlock(); return id; } extern int mem_cgroup_init(void); #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 #define root_mem_cgroup (NULL) static inline struct mem_cgroup *folio_memcg(struct folio *folio) { return NULL; } static inline bool folio_memcg_charged(struct folio *folio) { return false; } static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { return NULL; } static inline struct mem_cgroup *page_memcg_check(struct page *page) { return NULL; } static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) { return NULL; } static inline bool folio_memcg_kmem(struct folio *folio) { return false; } static inline bool PageMemcgKmem(struct page *page) { return false; } static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; } static inline bool mem_cgroup_disabled(void) { return true; } static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { } static inline void memcg_memory_event_mm(struct mm_struct *mm, enum memcg_memory_event event) { } static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, unsigned long *low) { *min = *low = 0; } static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg) { } static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, struct mem_cgroup *memcg) { return true; } static inline bool mem_cgroup_below_low(struct mem_cgroup *target, struct mem_cgroup *memcg) { return false; } static inline bool mem_cgroup_below_min(struct mem_cgroup *target, struct mem_cgroup *memcg) { return 
false; } static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { return 0; } static inline int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp) { return 0; } static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { return 0; } static inline void mem_cgroup_uncharge(struct folio *folio) { } static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { } static inline void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { } static inline void mem_cgroup_migrate(struct folio *old, struct folio *new) { } static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct pglist_data *pgdat) { return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); return &pgdat->__lruvec; } static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { } static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return NULL; } static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { return true; } static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { return NULL; } static inline struct mem_cgroup *get_mem_cgroup_from_current(void) { return NULL; } static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio) { return NULL; } static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) { return NULL; } static inline void obj_cgroup_get(struct obj_cgroup *objcg) { } static inline void obj_cgroup_put(struct obj_cgroup *objcg) { } static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) { return true; } static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) { return true; } static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } static inline struct lruvec *folio_lruvec_lock(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); spin_lock(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); spin_lock_irq(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flagsp) { struct pglist_data *pgdat = folio_pgdat(folio); spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); return &pgdat->__lruvec; } static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { return NULL; } static inline void mem_cgroup_iter_break(struct mem_cgroup *root, struct mem_cgroup *prev) { } static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*fn)(struct task_struct *, void *), void *arg) { } static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { return 0; } static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { WARN_ON_ONCE(id); /* XXX: This should always return root_mem_cgroup */ return NULL; } #ifdef CONFIG_SHRINKER_DEBUG static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return 0; } static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { return NULL; } #endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return NULL; } static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { return NULL; } static 
inline bool mem_cgroup_online(struct mem_cgroup *memcg) { return true; } static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { return 0; } static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { return 0; } static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg) { return 0; } static inline void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { } static inline void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { } static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { } static inline struct mem_cgroup *mem_cgroup_get_oom_group( struct task_struct *victim, struct mem_cgroup *oom_domain) { return NULL; } static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { } static inline void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int nr) { } static inline void mod_memcg_page_state(struct page *page, enum memcg_stat_item idx, int val) { } static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return 0; } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { return node_page_state(lruvec_pgdat(lruvec), idx); } static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx) { return node_page_state(lruvec_pgdat(lruvec), idx); } static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg) { } static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { } static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); __mod_node_page_state(page_pgdat(page), idx, val); } static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); mod_node_page_state(page_pgdat(page), idx, val); } static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { } static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { } static inline void count_memcg_events_mm(struct mm_struct *mm, enum vm_event_item idx, unsigned long count) { } static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } static inline void split_page_memcg(struct page *first, unsigned order) { } static inline void folio_split_memcg_refs(struct folio *folio, unsigned old_order, unsigned new_order) { } static inline u64 cgroup_id_from_mm(struct mm_struct *mm) { return 0; } static inline int mem_cgroup_init(void) { return 0; } #endif /* CONFIG_MEMCG */ /* * Extended information for slab objects stored as an array in page->memcg_data * if MEMCG_DATA_OBJEXTS is set. 
*/ struct slabobj_ext { #ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif #ifdef CONFIG_MEM_ALLOC_PROFILING union codetag_ref ref; #endif } __aligned(8); static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { __mod_lruvec_kmem_state(p, idx, 1); } static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) { __mod_lruvec_kmem_state(p, idx, -1); } static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) { struct mem_cgroup *memcg; memcg = lruvec_memcg(lruvec); if (!memcg) return NULL; memcg = parent_mem_cgroup(memcg); if (!memcg) return NULL; return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); } static inline void unlock_page_lruvec(struct lruvec *lruvec) { spin_unlock(&lruvec->lru_lock); } static inline void unlock_page_lruvec_irq(struct lruvec *lruvec) { spin_unlock_irq(&lruvec->lru_lock); } static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, unsigned long flags) { spin_unlock_irqrestore(&lruvec->lru_lock, flags); } /* Test requires a stable folio->memcg binding, see folio_memcg() */ static inline bool folio_matches_lruvec(struct folio *folio, struct lruvec *lruvec) { return lruvec_pgdat(lruvec) == folio_pgdat(folio) && lruvec_memcg(lruvec) == folio_memcg(folio); } /* Don't lock again iff page's lruvec locked */ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, struct lruvec *locked_lruvec) { if (locked_lruvec) { if (folio_matches_lruvec(folio, locked_lruvec)) return locked_lruvec; unlock_page_lruvec_irq(locked_lruvec); } return folio_lruvec_lock_irq(folio); } /* Don't lock again iff folio's lruvec locked */ static inline void folio_lruvec_relock_irqsave(struct folio *folio, struct lruvec **lruvecp, unsigned long *flags) { if (*lruvecp) { if (folio_matches_lruvec(folio, *lruvecp)) return; unlock_page_lruvec_irqrestore(*lruvecp, *flags); } *lruvecp = folio_lruvec_lock_irqsave(folio, flags); } #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pwriteback); void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, struct bdi_writeback *wb); static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; memcg = folio_memcg(folio); if (unlikely(memcg && &memcg->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(folio, wb); } void mem_cgroup_flush_foreign(struct bdi_writeback *wb); #else /* CONFIG_CGROUP_WRITEBACK */ static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) { return NULL; } static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pwriteback) { } static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { } static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb) { } #endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; #ifdef CONFIG_MEMCG extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) void mem_cgroup_sk_alloc(struct sock *sk); void mem_cgroup_sk_free(struct sock *sk); void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk); bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages, gfp_t 
gfp_mask); void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages); #if BITS_PER_LONG < 64 static inline void mem_cgroup_set_socket_pressure(struct mem_cgroup *memcg) { u64 val = get_jiffies_64() + HZ; unsigned long flags; write_seqlock_irqsave(&memcg->socket_pressure_seqlock, flags); memcg->socket_pressure = val; write_sequnlock_irqrestore(&memcg->socket_pressure_seqlock, flags); } static inline u64 mem_cgroup_get_socket_pressure(struct mem_cgroup *memcg) { unsigned int seq; u64 val; do { seq = read_seqbegin(&memcg->socket_pressure_seqlock); val = memcg->socket_pressure; } while (read_seqretry(&memcg->socket_pressure_seqlock, seq)); return val; } #else static inline void mem_cgroup_set_socket_pressure(struct mem_cgroup *memcg) { WRITE_ONCE(memcg->socket_pressure, jiffies + HZ); } static inline u64 mem_cgroup_get_socket_pressure(struct mem_cgroup *memcg) { return READ_ONCE(memcg->socket_pressure); } #endif int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); void reparent_shrinker_deferred(struct mem_cgroup *memcg); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { } static inline void mem_cgroup_sk_free(struct sock *sk) { } static inline void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) { } static inline bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages, gfp_t gfp_mask) { return false; } static inline void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages) { } static inline void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { } #endif #ifdef CONFIG_MEMCG bool mem_cgroup_kmem_disabled(void); int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); /* * The returned objcg pointer is safe to use without additional * protection within a scope. The scope is defined either by * the current task (similar to the "current" global variable) * or by set_active_memcg() pair. * Please, use obj_cgroup_get() to get a reference if the pointer * needs to be used outside of the local scope. */ struct obj_cgroup *current_obj_cgroup(void); struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio); static inline struct obj_cgroup *get_obj_cgroup_from_current(void) { struct obj_cgroup *objcg = current_obj_cgroup(); if (objcg) obj_cgroup_get(objcg); return objcg; } int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size); void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); extern struct static_key_false memcg_bpf_enabled_key; static inline bool memcg_bpf_enabled(void) { return static_branch_likely(&memcg_bpf_enabled_key); } extern struct static_key_false memcg_kmem_online_key; static inline bool memcg_kmem_online(void) { return static_branch_likely(&memcg_kmem_online_key); } static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { if (memcg_kmem_online()) return __memcg_kmem_charge_page(page, gfp, order); return 0; } static inline void memcg_kmem_uncharge_page(struct page *page, int order) { if (memcg_kmem_online()) __memcg_kmem_uncharge_page(page, order); } /* * A helper for accessing memcg's kmem_id, used for getting * corresponding LRU lists. */ static inline int memcg_kmem_id(struct mem_cgroup *memcg) { return memcg ? 
memcg->kmemcg_id : -1; } struct mem_cgroup *mem_cgroup_from_slab_obj(void *p); static inline void count_objcg_events(struct obj_cgroup *objcg, enum vm_event_item idx, unsigned long count) { struct mem_cgroup *memcg; if (!memcg_kmem_online()) return; rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); count_memcg_events(memcg, idx, count); rcu_read_unlock(); } bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid); #else static inline bool mem_cgroup_kmem_disabled(void) { return true; } static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { return 0; } static inline void memcg_kmem_uncharge_page(struct page *page, int order) { } static inline int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { return 0; } static inline void __memcg_kmem_uncharge_page(struct page *page, int order) { } static inline struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { return NULL; } static inline bool memcg_bpf_enabled(void) { return false; } static inline bool memcg_kmem_online(void) { return false; } static inline int memcg_kmem_id(struct mem_cgroup *memcg) { return -1; } static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) { return NULL; } static inline void count_objcg_events(struct obj_cgroup *objcg, enum vm_event_item idx, unsigned long count) { } static inline ino_t page_cgroup_ino(struct page *page) { return 0; } static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) { return true; } #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) bool obj_cgroup_may_zswap(struct obj_cgroup *objcg); void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size); void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size); bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg); #else static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) { return true; } static inline void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) { } static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) { } static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) { /* if zswap is disabled, do not block pages going to the swapping device */ return true; } #endif /* Cgroup v1-related declarations */ #ifdef CONFIG_MEMCG_V1 unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); bool mem_cgroup_oom_synchronize(bool wait); static inline bool task_in_memcg_oom(struct task_struct *p) { return p->memcg_in_oom; } static inline void mem_cgroup_enter_user_fault(void) { WARN_ON(current->in_user_fault); current->in_user_fault = 1; } static inline void mem_cgroup_exit_user_fault(void) { WARN_ON(!current->in_user_fault); current->in_user_fault = 0; } void memcg1_swapout(struct folio *folio, swp_entry_t entry); void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages); #else /* CONFIG_MEMCG_V1 */ static inline unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned) { return 0; } static inline bool task_in_memcg_oom(struct task_struct *p) { return false; } static inline bool mem_cgroup_oom_synchronize(bool wait) { return false; } static inline void mem_cgroup_enter_user_fault(void) { } static inline void mem_cgroup_exit_user_fault(void) { } static inline void memcg1_swapout(struct folio *folio, swp_entry_t entry) { } static inline void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) { } #endif /* 
CONFIG_MEMCG_V1 */ #endif /* _LINUX_MEMCONTROL_H */
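/*
 * Illustrative sketch, not part of memcontrol.h: how a reclaim path might
 * consult the protection helpers documented above.  Only the mem_cgroup_*()
 * calls come from this header; the example_* name is hypothetical.
 */
static inline bool example_memcg_is_protected(struct mem_cgroup *target,
					      struct mem_cgroup *memcg)
{
	/* Recompute the effective min/low values for this reclaim target. */
	mem_cgroup_calculate_protection(target, memcg);

	if (mem_cgroup_below_min(target, memcg))
		return true;	/* below memory.min: never reclaim */

	if (mem_cgroup_below_low(target, memcg))
		return true;	/* below memory.low: skip unless reclaim is desperate */

	return false;
}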
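/*
 * Illustrative sketch, not part of memcontrol.h: the usual charge/rollback
 * pairing for a newly allocated folio, following the mem_cgroup_charge()
 * kernel-doc above.  example_install_folio() is a hypothetical stand-in for
 * whatever consumes the folio (page-cache insertion, etc.).
 */
static inline int example_install_folio(struct folio *folio)
{
	/* Hypothetical consumer; a real caller would do useful work here. */
	return 0;
}

static inline int example_charge_new_folio(struct folio *folio,
					   struct mm_struct *mm, gfp_t gfp)
{
	/* Charge to @mm's memcg (or to the active memcg if @mm is NULL). */
	int err = mem_cgroup_charge(folio, mm, gfp);

	if (err)
		return err;

	err = example_install_folio(folio);
	if (err)
		mem_cgroup_uncharge(folio);	/* roll back on failure */

	return err;
}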
/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/uaccess.h> #include <linux/fs_struct.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/prefetch.h> #include "mount.h" #include "internal.h" struct prepend_buffer { char *buf; int len; }; #define DECLARE_BUFFER(__name, __buf, __len) \ struct prepend_buffer __name = {.buf = __buf + __len, .len = __len} static char *extract_string(struct prepend_buffer *p) { if (likely(p->len >= 0)) return p->buf; return ERR_PTR(-ENAMETOOLONG); } static bool prepend_char(struct prepend_buffer *p, unsigned char c) { if (likely(p->len > 0)) { p->len--; *--p->buf = c; return true; } p->len = -1; return false; } /* * The source of the prepend data can be an optimistic load * of a dentry name and length. And because we don't hold any * locks, the length and the pointer to the name may not be * in sync if a concurrent rename happens, and the kernel * copy might fault as a result. * * The end result will correct itself when we check the * rename sequence count, but we need to be able to handle * the fault gracefully. */ static bool prepend_copy(void *dst, const void *src, int len) { if (unlikely(copy_from_kernel_nofault(dst, src, len))) { memset(dst, 'x', len); return false; } return true; } static bool prepend(struct prepend_buffer *p, const char *str, int namelen) { // Already overflowed?
if (p->len < 0) return false; // Will overflow? if (p->len < namelen) { // Fill as much as possible from the end of the name str += namelen - p->len; p->buf -= p->len; prepend_copy(p->buf, str, p->len); p->len = -1; return false; } // Fits fully p->len -= namelen; p->buf -= namelen; return prepend_copy(p->buf, str, namelen); } /** * prepend_name - prepend a pathname in front of current buffer pointer * @p: prepend buffer which contains buffer pointer and allocated length * @name: name string and length qstr structure * * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to * make sure that either the old or the new name pointer and length are * fetched. However, there may be mismatch between length and pointer. * But since the length cannot be trusted, we need to copy the name very * carefully when doing the prepend_copy(). It also prepends "/" at * the beginning of the name. The sequence number check at the caller will * retry it again when a d_move() does happen. So any garbage in the buffer * due to mismatched pointer and length will be discarded. * * Load acquire is needed to make sure that we see the new name data even * if we might get the length wrong. */ static bool prepend_name(struct prepend_buffer *p, const struct qstr *name) { const char *dname = smp_load_acquire(&name->name); /* ^^^ */ u32 dlen = READ_ONCE(name->len); return prepend(p, dname, dlen) && prepend_char(p, '/'); } static int __prepend_path(const struct dentry *dentry, const struct mount *mnt, const struct path *root, struct prepend_buffer *p) { while (dentry != root->dentry || &mnt->mnt != root->mnt) { const struct dentry *parent = READ_ONCE(dentry->d_parent); if (dentry == mnt->mnt.mnt_root) { struct mount *m = READ_ONCE(mnt->mnt_parent); struct mnt_namespace *mnt_ns; if (likely(mnt != m)) { dentry = READ_ONCE(mnt->mnt_mountpoint); mnt = m; continue; } /* Global root */ mnt_ns = READ_ONCE(mnt->mnt_ns); /* open-coded is_mounted() to use local mnt_ns */ if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns)) return 1; // absolute root else return 2; // detached or not attached yet } if (unlikely(dentry == parent)) /* Escaped? */ return 3; prefetch(parent); if (!prepend_name(p, &dentry->d_name)) break; dentry = parent; } return 0; } /** * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry * @p: prepend buffer which contains buffer pointer and allocated length * * The function will first try to write out the pathname without taking any * lock other than the RCU read lock to make sure that dentries won't go away. * It only checks the sequence number of the global rename_lock as any change * in the dentry's d_seq will be preceded by changes in the rename_lock * sequence number. If the sequence number had been changed, it will restart * the whole pathname back-tracing sequence again by taking the rename_lock. * In this case, there is no need to take the RCU read lock as the recursive * parent pointer references will keep the dentry chain alive as long as no * rename operation is performed. 
*/ static int prepend_path(const struct path *path, const struct path *root, struct prepend_buffer *p) { unsigned seq, m_seq = 0; struct prepend_buffer b; int error; rcu_read_lock(); restart_mnt: read_seqbegin_or_lock(&mount_lock, &m_seq); seq = 0; rcu_read_lock(); restart: b = *p; read_seqbegin_or_lock(&rename_lock, &seq); error = __prepend_path(path->dentry, real_mount(path->mnt), root, &b); if (!(seq & 1)) rcu_read_unlock(); if (need_seqretry(&rename_lock, seq)) { seq = 1; goto restart; } done_seqretry(&rename_lock, seq); if (!(m_seq & 1)) rcu_read_unlock(); if (need_seqretry(&mount_lock, m_seq)) { m_seq = 1; goto restart_mnt; } done_seqretry(&mount_lock, m_seq); if (unlikely(error == 3)) b = *p; if (b.len == p->len) prepend_char(&b, '/'); *p = b; return error; } /** * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry * @buf: buffer to return value in * @buflen: buffer length * * Convert a dentry into an ASCII path name. * * Returns a pointer into the buffer or an error code if the * path was too long. * * "buflen" should be positive. * * If the path is not reachable from the supplied root, return %NULL. */ char *__d_path(const struct path *path, const struct path *root, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); if (unlikely(prepend_path(path, root, &b) > 0)) return NULL; return extract_string(&b); } char *d_absolute_path(const struct path *path, char *buf, int buflen) { struct path root = {}; DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); if (unlikely(prepend_path(path, &root, &b) > 1)) return ERR_PTR(-EINVAL); return extract_string(&b); } static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) { unsigned seq; do { seq = read_seqbegin(&fs->seq); *root = fs->root; } while (read_seqretry(&fs->seq, seq)); } /** * d_path - return the path of a dentry * @path: path to report * @buf: buffer to return value in * @buflen: buffer length * * Convert a dentry into an ASCII path name. If the entry has been deleted * the string " (deleted)" is appended. Note that this is ambiguous. * * Returns a pointer into the buffer or an error code if the path was * too long. Note: Callers should use the returned pointer, not the passed * in buffer, to use the name! The implementation often starts at an offset * into the buffer, and may leave 0 bytes at the start. * * "buflen" should be positive. */ char *d_path(const struct path *path, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); struct path root; /* * We have various synthetic filesystems that never get mounted. On * these filesystems dentries are never used for lookup purposes, and * thus don't need to be hashed. They also don't need a name until a * user wants to identify the object in /proc/pid/fd/. The little hack * below allows us to generate a name for these objects on demand: * * Some pseudo inodes are mountable. When they are mounted * path->dentry == path->mnt->mnt_root. In that case don't call d_dname * and instead have d_path return the mounted path. 
*/ if (path->dentry->d_op && path->dentry->d_op->d_dname && (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root)) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); rcu_read_lock(); get_fs_root_rcu(current->fs, &root); if (unlikely(d_unlinked(path->dentry))) prepend(&b, " (deleted)", 11); else prepend_char(&b, 0); prepend_path(path, &root, &b); rcu_read_unlock(); return extract_string(&b); } EXPORT_SYMBOL(d_path); /* * Helper function for dentry_operations.d_dname() members */ char *dynamic_dname(char *buffer, int buflen, const char *fmt, ...) { va_list args; char temp[64]; int sz; va_start(args, fmt); sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1; va_end(args); if (sz > sizeof(temp) || sz > buflen) return ERR_PTR(-ENAMETOOLONG); buffer += buflen - sz; return memcpy(buffer, temp, sz); } char *simple_dname(struct dentry *dentry, char *buffer, int buflen) { DECLARE_BUFFER(b, buffer, buflen); /* these dentries are never renamed, so d_lock is not needed */ prepend(&b, " (deleted)", 11); prepend(&b, dentry->d_name.name, dentry->d_name.len); prepend_char(&b, '/'); return extract_string(&b); } /* * Write full pathname from the root of the filesystem into the buffer. */ static char *__dentry_path(const struct dentry *d, struct prepend_buffer *p) { const struct dentry *dentry; struct prepend_buffer b; int seq = 0; rcu_read_lock(); restart: dentry = d; b = *p; read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { const struct dentry *parent = dentry->d_parent; prefetch(parent); if (!prepend_name(&b, &dentry->d_name)) break; dentry = parent; } if (!(seq & 1)) rcu_read_unlock(); if (need_seqretry(&rename_lock, seq)) { seq = 1; goto restart; } done_seqretry(&rename_lock, seq); if (b.len == p->len) prepend_char(&b, '/'); return extract_string(&b); } char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); return __dentry_path(dentry, &b); } EXPORT_SYMBOL(dentry_path_raw); char *dentry_path(const struct dentry *dentry, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); if (unlikely(d_unlinked(dentry))) prepend(&b, "//deleted", 10); else prepend_char(&b, 0); return __dentry_path(dentry, &b); } static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root, struct path *pwd) { unsigned seq; do { seq = read_seqbegin(&fs->seq); *root = fs->root; *pwd = fs->pwd; } while (read_seqretry(&fs->seq, seq)); } /* * NOTE! The user-level library version returns a * character pointer. The kernel system call just * returns the length of the buffer filled (which * includes the ending '\0' character), or a negative * error value. 
So libc would do something like * * char *getcwd(char * buf, size_t size) * { * int retval; * * retval = sys_getcwd(buf, size); * if (retval >= 0) * return buf; * errno = -retval; * return NULL; * } */ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) { int error; struct path pwd, root; char *page = __getname(); if (!page) return -ENOMEM; rcu_read_lock(); get_fs_root_and_pwd_rcu(current->fs, &root, &pwd); if (unlikely(d_unlinked(pwd.dentry))) { rcu_read_unlock(); error = -ENOENT; } else { unsigned len; DECLARE_BUFFER(b, page, PATH_MAX); prepend_char(&b, 0); if (unlikely(prepend_path(&pwd, &root, &b) > 0)) prepend(&b, "(unreachable)", 13); rcu_read_unlock(); len = PATH_MAX - b.len; if (unlikely(len > PATH_MAX)) error = -ENAMETOOLONG; else if (unlikely(len > size)) error = -ERANGE; else if (copy_to_user(buf, b.buf, len)) error = -EFAULT; else error = len; } __putname(page); return error; } |
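The return convention described in the comment above can also be exercised from user space. Below is a minimal, hedged sketch (not part of the kernel sources) that calls the raw getcwd system call through glibc's syscall(2) wrapper; the 4096-byte buffer is an arbitrary stand-in for PATH_MAX. It mirrors the libc wrapper sketched in the comment: a negative kernel return becomes errno, and a non-negative return is the number of bytes written, NUL included.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];	/* assumed large enough; PATH_MAX in practice */
	long len = syscall(SYS_getcwd, buf, sizeof(buf));

	if (len < 0) {
		/* glibc's syscall() maps the negative kernel return to errno */
		fprintf(stderr, "getcwd: %s\n", strerror(errno));
		return 1;
	}
	/* The kernel returns the length including the trailing '\0'. */
	printf("cwd = %s (len %ld, strlen %zu)\n", buf, len, strlen(buf));
	return 0;
}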
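As the d_path() kernel-doc above stresses, callers must use the returned pointer rather than the buffer they passed in, because the name is assembled from the end of the buffer. The following is a minimal sketch of a hypothetical in-kernel caller (print_file_path() is not a function from this file) showing the usual __getname()/__putname() pattern around d_path().

#include <linux/dcache.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/limits.h>
#include <linux/printk.h>

/* Hypothetical helper: log the path of an open file. */
static int print_file_path(struct file *file)
{
	char *buf, *p;

	buf = __getname();		/* PATH_MAX-sized scratch buffer */
	if (!buf)
		return -ENOMEM;

	p = d_path(&file->f_path, buf, PATH_MAX);
	if (IS_ERR(p)) {
		__putname(buf);
		return PTR_ERR(p);
	}
	pr_info("path: %s\n", p);	/* use the returned pointer, not buf */
	__putname(buf);
	return 0;
}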
// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: GRE over IP protocol decoder. * * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/in6.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <net/flow.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/ip_tunnels.h> #include <net/arp.h> #include <net/checksum.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/gre.h> #include <net/dst_metadata.h> #include <net/erspan.h> /* Problems & solutions -------------------- 1. The most important issue is detecting local dead loops. They would cause complete host lockup in transmit, which would be "resolved" by stack overflow or, if queueing is enabled, by infinite looping in net_bh. We cannot track such dead loops during route installation; it is an infeasible task. The most general solution would be to keep an skb->encapsulation counter (a sort of local ttl) and silently drop the packet when it expires. It is a good solution, but it requires maintaining a new variable in ALL skbs, even if no tunneling is used. Current solution: xmit_recursion breaks dead loops. This is a percpu counter, since when we enter the first ndo_xmit(), cpu migration is forbidden. We force an exit if this counter reaches RECURSION_LIMIT. 2. Networking dead loops would not kill routers, but would really kill the network. The IP hop limit plays the role of "t->recursion" in this case, if we copy it from the packet being encapsulated to the upper header. 
It is a very good solution, but it introduces two problems: - Routing protocols that use packets with ttl=1 (OSPF, RIP2) do not work over tunnels. - traceroute does not work. I planned to relay ICMP from the tunnel, so that this problem would be solved and traceroute output would be even more informative. This idea appeared to be wrong: only Linux complies with rfc1812 now (yes, guys, Linux is the only true router now :-)); all routers (at least, in my neighbourhood) return only 8 bytes of payload. That is the end of it. Hence, if we want OSPF to work or traceroute to say something reasonable, we should search for another solution. One of them is to parse the packet, trying to detect inner encapsulation made by our node. It is difficult or even impossible, especially taking fragmentation into account. To be short, ttl is not a solution at all. Current solution: The solution was UNEXPECTEDLY SIMPLE. We force the DF flag on tunnels with a preconfigured hop limit, that is ALL. :-) Well, it does not remove the problem completely, but exponential growth of network traffic is changed to linear (branches that exceed the pmtu are pruned) and the tunnel mtu rapidly degrades to a value <68, where looping stops. Yes, it is not good if there exists a router in the loop which does not force DF, even when encapsulating packets have DF set. But it is not our problem! Nobody could accuse us; we did all that we could. Even if it was your gated that injected the fatal route into the network, even if it was you who configured the fatal static route: you are innocent. :-) Alexey Kuznetsov. */ static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; static const struct header_ops ipgre_header_ops; static int ipgre_tunnel_init(struct net_device *dev); static void erspan_build_header(struct sk_buff *skb, u32 id, u32 index, bool truncate, bool is_ipv4); static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; static unsigned int erspan_net_id __read_mostly; static int ipgre_err(struct sk_buff *skb, u32 info, const struct tnl_ptk_info *tpi) { /* All the routers (except for Linux) return only 8 bytes of packet payload. It means that precise relaying of ICMP in the real Internet is absolutely infeasible. Moreover, Cisco "wise men" put the GRE key in the third word of the GRE header. That makes it impossible to maintain even soft state for keyed GRE tunnels with checksums enabled. Tell them "thank you". Well, I wonder: rfc1812 was written by a Cisco employee, so why the hell do these idiots break standards established by themselves??? */ struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn; const struct iphdr *iph; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; if (tpi->proto == htons(ETH_P_TEB)) itn = net_generic(net, gre_tap_net_id); else if (tpi->proto == htons(ETH_P_ERSPAN) || tpi->proto == htons(ETH_P_ERSPAN2)) itn = net_generic(net, erspan_net_id); else itn = net_generic(net, ipgre_net_id); iph = (const struct iphdr *)(icmp_hdr(skb) + 1); t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, iph->daddr, iph->saddr, tpi->key); if (!t) return -ENOENT; switch (type) { default: case ICMP_PARAMETERPROB: return 0; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: case ICMP_PORT_UNREACH: /* Impossible event. */ return 0; default: /* All others are translated to HOST_UNREACH. 
rfc2003 contains "deep thoughts" about NET_UNREACH, I believe they are just ether pollution. --ANK */ break; } break; case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return 0; break; case ICMP_REDIRECT: break; } #if IS_ENABLED(CONFIG_IPV6) if (tpi->proto == htons(ETH_P_IPV6)) { unsigned int data_len = 0; if (type == ICMP_TIME_EXCEEDED) data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, type, data_len)) return 0; } #endif if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) return 0; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) return 0; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; return 0; } static void gre_err(struct sk_buff *skb, u32 info) { /* All the routers (except for Linux) return only * 8 bytes of packet payload. It means, that precise relaying of * ICMP in the real Internet is absolutely infeasible. * * Moreover, Cisco "wise men" put GRE key to the third word * in GRE header. It makes impossible maintaining even soft * state for keyed * GRE tunnels with enabled checksum. Tell them "thank you". * * Well, I wonder, rfc1812 was written by Cisco employee, * what the hell these idiots break standards established * by themselves??? */ const struct iphdr *iph = (struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct tnl_ptk_info tpi; if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP), iph->ihl * 4) < 0) return; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, skb->dev->ifindex, IPPROTO_GRE); return; } if (type == ICMP_REDIRECT) { ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, IPPROTO_GRE); return; } ipgre_err(skb, info, &tpi); } static bool is_erspan_type1(int gre_hdr_len) { /* Both ERSPAN type I (version 0) and type II (version 1) use * protocol 0x88BE, but the type I has only 4-byte GRE header, * while type II has 8-byte. 
*/ return gre_hdr_len == 4; } static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int gre_hdr_len) { struct net *net = dev_net(skb->dev); struct metadata_dst *tun_dst = NULL; struct erspan_base_hdr *ershdr; IP_TUNNEL_DECLARE_FLAGS(flags); struct ip_tunnel_net *itn; struct ip_tunnel *tunnel; const struct iphdr *iph; struct erspan_md2 *md2; int ver; int len; ip_tunnel_flags_copy(flags, tpi->flags); itn = net_generic(net, erspan_net_id); iph = ip_hdr(skb); if (is_erspan_type1(gre_hdr_len)) { ver = 0; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, 0); } else { if (unlikely(!pskb_may_pull(skb, gre_hdr_len + sizeof(*ershdr)))) return PACKET_REJECT; ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); ver = ershdr->ver; iph = ip_hdr(skb); __set_bit(IP_TUNNEL_KEY_BIT, flags); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, tpi->key); } if (tunnel) { if (is_erspan_type1(gre_hdr_len)) len = gre_hdr_len; else len = gre_hdr_len + erspan_hdr_len(ver); if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; if (__iptunnel_pull_header(skb, len, htons(ETH_P_TEB), false, false) < 0) goto drop; if (tunnel->collect_md) { struct erspan_metadata *pkt_md, *md; struct ip_tunnel_info *info; unsigned char *gh; __be64 tun_id; __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags); ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ip_tun_rx_dst(skb, flags, tun_id, sizeof(*md)); if (!tun_dst) return PACKET_REJECT; /* skb can be uncloned in __iptunnel_pull_header, so * old pkt_md is no longer valid and we need to reset * it */ gh = skb_network_header(skb) + skb_network_header_len(skb); pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len + sizeof(*ershdr)); md = ip_tunnel_info_opts(&tun_dst->u.tun_info); md->version = ver; md2 = &md->u.md2; memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE); info = &tun_dst->u.tun_info; __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags); info->options_len = sizeof(*md); } skb_reset_mac_header(skb); ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); return PACKET_RCVD; } return PACKET_REJECT; drop: kfree_skb(skb); return PACKET_RCVD; } static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct ip_tunnel_net *itn, int hdr_len, bool raw_proto) { struct metadata_dst *tun_dst = NULL; const struct iphdr *iph; struct ip_tunnel *tunnel; iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, iph->saddr, iph->daddr, tpi->key); if (tunnel) { const struct iphdr *tnl_params; if (__iptunnel_pull_header(skb, hdr_len, tpi->proto, raw_proto, false) < 0) goto drop; /* Special case for ipgre_header_parse(), which expects the * mac_header to point to the outer IP header. 
*/ if (tunnel->dev->header_ops == &ipgre_header_ops) skb_pop_mac_header(skb); else skb_reset_mac_header(skb); tnl_params = &tunnel->parms.iph; if (tunnel->collect_md || tnl_params->daddr == 0) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; __be64 tun_id; __set_bit(IP_TUNNEL_CSUM_BIT, flags); __set_bit(IP_TUNNEL_KEY_BIT, flags); ip_tunnel_flags_and(flags, tpi->flags, flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); if (!tun_dst) return PACKET_REJECT; } ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); return PACKET_RCVD; } return PACKET_NEXT; drop: kfree_skb(skb); return PACKET_RCVD; } static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, int hdr_len) { struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn; int res; if (tpi->proto == htons(ETH_P_TEB)) itn = net_generic(net, gre_tap_net_id); else itn = net_generic(net, ipgre_net_id); res = __ipgre_rcv(skb, tpi, itn, hdr_len, false); if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) { /* ipgre tunnels in collect metadata mode should receive * also ETH_P_TEB traffic. */ itn = net_generic(net, ipgre_net_id); res = __ipgre_rcv(skb, tpi, itn, hdr_len, true); } return res; } static int gre_rcv(struct sk_buff *skb) { struct tnl_ptk_info tpi; bool csum_err = false; int hdr_len; #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { /* Looped back packet, drop it! */ if (rt_is_output_route(skb_rtable(skb))) goto drop; } #endif hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0); if (hdr_len < 0) goto drop; if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || tpi.proto == htons(ETH_P_ERSPAN2))) { if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; goto out; } if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; out: icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); IP_TUNNEL_DECLARE_FLAGS(flags); ip_tunnel_flags_copy(flags, tunnel->parms.o_flags); /* Push GRE header. */ gre_build_header(skb, tunnel->tun_hlen, flags, proto, tunnel->parms.o_key, test_bit(IP_TUNNEL_SEQ_BIT, flags) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } static int gre_handle_offloads(struct sk_buff *skb, bool csum) { return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; int tunnel_hlen; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET)) goto err_free_skb; key = &tun_info->key; tunnel_hlen = gre_calc_hlen(key->tun_flags); if (skb_cow_head(skb, dev->needed_headroom)) goto err_free_skb; /* Push Tunnel header. */ if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags))) goto err_free_skb; __set_bit(IP_TUNNEL_CSUM_BIT, flags); __set_bit(IP_TUNNEL_KEY_BIT, flags); __set_bit(IP_TUNNEL_SEQ_BIT, flags); ip_tunnel_flags_and(flags, tun_info->key.tun_flags, flags); gre_build_header(skb, tunnel_hlen, flags, proto, tunnel_id_to_key32(tun_info->key.tun_id), test_bit(IP_TUNNEL_SEQ_BIT, flags) ? 
htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); return; err_free_skb: kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); } static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; struct erspan_metadata *md; bool truncate = false; __be16 proto; int tunnel_hlen; int version; int nhoff; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET)) goto err_free_skb; key = &tun_info->key; if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags)) goto err_free_skb; if (tun_info->options_len < sizeof(*md)) goto err_free_skb; md = ip_tunnel_info_opts(tun_info); /* ERSPAN has fixed 8 byte GRE header */ version = md->version; tunnel_hlen = 8 + erspan_hdr_len(version); if (skb_cow_head(skb, dev->needed_headroom)) goto err_free_skb; if (gre_handle_offloads(skb, false)) goto err_free_skb; if (skb->len > dev->mtu + dev->hard_header_len) { if (pskb_trim(skb, dev->mtu + dev->hard_header_len)) goto err_free_skb; truncate = true; } nhoff = skb_network_offset(skb); if (skb->protocol == htons(ETH_P_IP) && (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff)) truncate = true; if (skb->protocol == htons(ETH_P_IPV6)) { int thoff; if (skb_transport_header_was_set(skb)) thoff = skb_transport_offset(skb); else thoff = nhoff + sizeof(struct ipv6hdr); if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff) truncate = true; } if (version == 1) { erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)), ntohl(md->u.index), truncate, true); proto = htons(ETH_P_ERSPAN); } else if (version == 2) { erspan_build_header_v2(skb, ntohl(tunnel_id_to_key32(key->tun_id)), md->u.md2.dir, get_hwid(&md->u.md2), truncate, true); proto = htons(ETH_P_ERSPAN2); } else { goto err_free_skb; } __set_bit(IP_TUNNEL_SEQ_BIT, flags); gre_build_header(skb, 8, flags, proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno))); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); return; err_free_skb: kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); } static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct ip_tunnel_key *key; struct rtable *rt; struct flowi4 fl4; if (ip_tunnel_info_af(info) != AF_INET) return -EINVAL; key = &info->key; ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), key->tos & ~INET_ECN_MASK, dev_net(dev), 0, skb->mark, skb_get_hash(skb), key->flow_flags); rt = ip_route_output_key(dev_net(dev), &fl4); if (IS_ERR(rt)) return PTR_ERR(rt); ip_rt_put(rt); info->key.u.ipv4.src = fl4.saddr; return 0; } static netdev_tx_t ipgre_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tnl_params; if (!pskb_inet_may_pull(skb)) goto free_skb; if (tunnel->collect_md) { gre_fb_xmit(skb, dev, skb->protocol); return NETDEV_TX_OK; } if (dev->header_ops) { int pull_len = tunnel->hlen + sizeof(struct iphdr); if (skb_cow_head(skb, 0)) goto free_skb; if (!pskb_may_pull(skb, pull_len)) goto free_skb; tnl_params = (const struct iphdr *)skb->data; /* ip_tunnel_xmit() needs skb->data pointing to gre header. 
*/ skb_pull(skb, pull_len); skb_reset_mac_header(skb); if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start(skb) < skb->data) goto free_skb; } else { if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; tnl_params = &tunnel->parms.iph; } if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags))) goto free_skb; __gre_xmit(skb, dev, tnl_params, skb->protocol); return NETDEV_TX_OK; free_skb: kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } static netdev_tx_t erspan_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); bool truncate = false; __be16 proto; if (!pskb_inet_may_pull(skb)) goto free_skb; if (tunnel->collect_md) { erspan_fb_xmit(skb, dev); return NETDEV_TX_OK; } if (gre_handle_offloads(skb, false)) goto free_skb; if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; if (skb->len > dev->mtu + dev->hard_header_len) { if (pskb_trim(skb, dev->mtu + dev->hard_header_len)) goto free_skb; truncate = true; } /* Push ERSPAN header */ if (tunnel->erspan_ver == 0) { proto = htons(ETH_P_ERSPAN); __clear_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags); } else if (tunnel->erspan_ver == 1) { erspan_build_header(skb, ntohl(tunnel->parms.o_key), tunnel->index, truncate, true); proto = htons(ETH_P_ERSPAN); } else if (tunnel->erspan_ver == 2) { erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key), tunnel->dir, tunnel->hwid, truncate, true); proto = htons(ETH_P_ERSPAN2); } else { goto free_skb; } __clear_bit(IP_TUNNEL_KEY_BIT, tunnel->parms.o_flags); __gre_xmit(skb, dev, &tunnel->parms.iph, proto); return NETDEV_TX_OK; free_skb: kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); if (!pskb_inet_may_pull(skb)) goto free_skb; if (tunnel->collect_md) { gre_fb_xmit(skb, dev, htons(ETH_P_TEB)); return NETDEV_TX_OK; } if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags))) goto free_skb; if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB)); return NETDEV_TX_OK; free_skb: kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } static void ipgre_link_update(struct net_device *dev, bool set_mtu) { struct ip_tunnel *tunnel = netdev_priv(dev); int len; len = tunnel->tun_hlen; tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); len = tunnel->tun_hlen - len; tunnel->hlen = tunnel->hlen + len; if (dev->header_ops) dev->hard_header_len += len; else dev->needed_headroom += len; if (set_mtu) WRITE_ONCE(dev->mtu, max_t(int, dev->mtu - len, 68)); if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags) || (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) && tunnel->encap.type != TUNNEL_ENCAP_NONE)) { dev->features &= ~NETIF_F_GSO_SOFTWARE; dev->hw_features &= ~NETIF_F_GSO_SOFTWARE; } else { dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; } } static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { __be16 i_flags, o_flags; int err; if (!ip_tunnel_flags_is_be16_compat(p->i_flags) || !ip_tunnel_flags_is_be16_compat(p->o_flags)) return -EOVERFLOW; i_flags = ip_tunnel_flags_to_be16(p->i_flags); o_flags = ip_tunnel_flags_to_be16(p->o_flags); if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE || p->iph.ihl != 5 || (p->iph.frag_off & 
htons(~IP_DF)) || ((i_flags | o_flags) & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; } gre_flags_to_tnl_flags(p->i_flags, i_flags); gre_flags_to_tnl_flags(p->o_flags, o_flags); err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd == SIOCCHGTUNNEL) { struct ip_tunnel *t = netdev_priv(dev); ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags); ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags); if (strcmp(dev->rtnl_link_ops->kind, "erspan")) ipgre_link_update(dev, true); } i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); ip_tunnel_flags_from_be16(p->i_flags, i_flags); o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); ip_tunnel_flags_from_be16(p->o_flags, o_flags); return 0; } /* Nice toy. Unfortunately, useless in real life :-) It allows one to construct a virtual multiprotocol broadcast "LAN" over the Internet, provided multicast routing is tuned. I have no idea whether this bicycle was invented before me, so I had to set ARPHRD_IPGRE to a random value. I have an impression that Cisco could make something similar, but this feature is apparently missing in IOS<=11.2(8). I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks with broadcast 224.66.66.66. If you have access to mbone, play with me :-) ping -t 255 224.66.66.66 If nobody answers, mbone does not work. ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255 ip addr add 10.66.66.<somewhat>/24 dev Universe ifconfig Universe up ifconfig Universe add fe80::<Your_real_addr>/10 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96 ftp 10.66.66.66 ... ftp fec0:6666:6666::193.233.7.65 ... */ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct ip_tunnel *t = netdev_priv(dev); struct iphdr *iph; struct gre_base_hdr *greh; iph = skb_push(skb, t->hlen + sizeof(*iph)); greh = (struct gre_base_hdr *)(iph+1); greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags); greh->protocol = htons(type); memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); /* Set the source hardware address. 
*/ if (saddr) memcpy(&iph->saddr, saddr, 4); if (daddr) memcpy(&iph->daddr, daddr, 4); if (iph->daddr) return t->hlen + sizeof(*iph); return -(t->hlen + sizeof(*iph)); } static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) { const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb); memcpy(haddr, &iph->saddr, 4); return 4; } static const struct header_ops ipgre_header_ops = { .create = ipgre_header, .parse = ipgre_header_parse, }; #ifdef CONFIG_NET_IPGRE_BROADCAST static int ipgre_open(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); if (ipv4_is_multicast(t->parms.iph.daddr)) { struct flowi4 fl4 = { .flowi4_oif = t->parms.link, .flowi4_dscp = ip4h_dscp(&t->parms.iph), .flowi4_scope = RT_SCOPE_UNIVERSE, .flowi4_proto = IPPROTO_GRE, .saddr = t->parms.iph.saddr, .daddr = t->parms.iph.daddr, .fl4_gre_key = t->parms.o_key, }; struct rtable *rt; rt = ip_route_output_key(t->net, &fl4); if (IS_ERR(rt)) return -EADDRNOTAVAIL; dev = rt->dst.dev; ip_rt_put(rt); if (!__in_dev_get_rtnl(dev)) return -EADDRNOTAVAIL; t->mlink = dev->ifindex; ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr); } return 0; } static int ipgre_close(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { struct in_device *in_dev; in_dev = inetdev_by_index(t->net, t->mlink); if (in_dev) ip_mc_dec_group(in_dev, t->parms.iph.daddr); } return 0; } #endif static const struct net_device_ops ipgre_netdev_ops = { .ndo_init = ipgre_tunnel_init, .ndo_uninit = ip_tunnel_uninit, #ifdef CONFIG_NET_IPGRE_BROADCAST .ndo_open = ipgre_open, .ndo_stop = ipgre_close, #endif .ndo_start_xmit = ipgre_xmit, .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipgre_tunnel_ctl, }; #define GRE_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_HW_CSUM) static void ipgre_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipgre_netdev_ops; dev->type = ARPHRD_IPGRE; ip_tunnel_setup(dev, ipgre_net_id); } static void __gre_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel; tunnel = netdev_priv(dev); tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph); dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; /* TCP offload with GRE SEQ is not supported, nor can we support 2 * levels of outer headers requiring an update. 
*/ if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags)) return; if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) && tunnel->encap.type != TUNNEL_ENCAP_NONE) return; dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->lltx = true; } static int ipgre_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; __gre_tunnel_init(dev); __dev_addr_set(dev, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); dev->flags = IFF_NOARP; netif_keep_dst(dev); dev->addr_len = 4; if (iph->daddr && !tunnel->collect_md) { #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { if (!iph->saddr) return -EINVAL; dev->flags = IFF_BROADCAST; dev->header_ops = &ipgre_header_ops; dev->hard_header_len = tunnel->hlen + sizeof(*iph); dev->needed_headroom = 0; } #endif } else if (!tunnel->collect_md) { dev->header_ops = &ipgre_header_ops; dev->hard_header_len = tunnel->hlen + sizeof(*iph); dev->needed_headroom = 0; } return ip_tunnel_init(dev); } static const struct gre_protocol ipgre_protocol = { .handler = gre_rcv, .err_handler = gre_err, }; static int __net_init ipgre_init_net(struct net *net) { return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } static void __net_exit ipgre_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { ip_tunnel_de |